The Global Power Plant Database is an open source database of power plants worldwide, containing information on approximately 35,000 power plants from 167 countries. It includes both thermal and renewable plants, with data on plant capacity, generation, ownership, and fuel type. The database is continuously updated as new data becomes available. The dataset link provided is for the power plants in India.
# importing the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
#Uploading database
data = pd.read_csv('database_IND.csv')
data.shape
(907, 27)
data.head()
| country | country_long | name | gppd_idnr | capacity_mw | latitude | longitude | primary_fuel | other_fuel1 | other_fuel2 | ... | year_of_capacity_data | generation_gwh_2013 | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | generation_gwh_2019 | generation_data_source | estimated_generation_gwh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IND | India | ACME Solar Tower | WRI1020239 | 2.5 | 28.1839 | 73.2407 | Solar | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | IND | India | ADITYA CEMENT WORKS | WRI1019881 | 98.0 | 24.7663 | 74.6090 | Coal | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | IND | India | AES Saurashtra Windfarms | WRI1026669 | 39.2 | 21.9038 | 69.3732 | Wind | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | IND | India | AGARTALA GT | IND0000001 | 135.0 | 23.8712 | 91.3602 | Gas | NaN | NaN | ... | 2019.0 | NaN | 617.789264 | 843.747 | 886.004428 | 663.774500 | 626.239128 | NaN | Central Electricity Authority | NaN |
| 4 | IND | India | AKALTARA TPP | IND0000002 | 1800.0 | 21.9603 | 82.4091 | Coal | Oil | NaN | ... | 2019.0 | NaN | 3035.550000 | 5916.370 | 6243.000000 | 5385.579736 | 7279.000000 | NaN | Central Electricity Authority | NaN |
5 rows × 27 columns
data.isnull().sum()
country 0 country_long 0 name 0 gppd_idnr 0 capacity_mw 0 latitude 46 longitude 46 primary_fuel 0 other_fuel1 709 other_fuel2 906 other_fuel3 907 commissioning_year 380 owner 565 source 0 url 0 geolocation_source 19 wepp_id 907 year_of_capacity_data 388 generation_gwh_2013 907 generation_gwh_2014 509 generation_gwh_2015 485 generation_gwh_2016 473 generation_gwh_2017 467 generation_gwh_2018 459 generation_gwh_2019 907 generation_data_source 458 estimated_generation_gwh 907 dtype: int64
data.describe()
| capacity_mw | latitude | longitude | other_fuel3 | commissioning_year | wepp_id | year_of_capacity_data | generation_gwh_2013 | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | generation_gwh_2019 | estimated_generation_gwh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 907.000000 | 861.000000 | 861.000000 | 0.0 | 527.000000 | 0.0 | 519.0 | 0.0 | 398.000000 | 422.000000 | 434.000000 | 440.000000 | 448.000000 | 0.0 | 0.0 |
| mean | 326.223755 | 21.197918 | 77.464907 | NaN | 1997.091082 | NaN | 2019.0 | NaN | 2431.823590 | 2428.226946 | 2467.936859 | 2547.759305 | 2600.804099 | NaN | NaN |
| std | 590.085456 | 6.239612 | 4.939316 | NaN | 17.082868 | NaN | 0.0 | NaN | 4026.440035 | 4194.596959 | 4162.884308 | 4196.991169 | 4314.880456 | NaN | NaN |
| min | 0.000000 | 8.168900 | 68.644700 | NaN | 1927.000000 | NaN | 2019.0 | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | NaN |
| 25% | 16.725000 | 16.773900 | 74.256200 | NaN | 1988.000000 | NaN | 2019.0 | NaN | 223.557672 | 176.381062 | 188.285252 | 177.874930 | 193.378250 | NaN | NaN |
| 50% | 59.200000 | 21.780000 | 76.719500 | NaN | 2001.000000 | NaN | 2019.0 | NaN | 801.123775 | 711.181225 | 737.205450 | 817.977250 | 751.644375 | NaN | NaN |
| 75% | 385.250000 | 25.512400 | 79.440800 | NaN | 2012.000000 | NaN | 2019.0 | NaN | 3035.306250 | 3084.121250 | 3282.861313 | 3275.690475 | 3143.535900 | NaN | NaN |
| max | 4760.000000 | 34.649000 | 95.408000 | NaN | 2018.000000 | NaN | 2019.0 | NaN | 28127.000000 | 30539.000000 | 30015.000000 | 35116.000000 | 35136.000000 | NaN | NaN |
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 907 entries, 0 to 906 Data columns (total 27 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 country 907 non-null object 1 country_long 907 non-null object 2 name 907 non-null object 3 gppd_idnr 907 non-null object 4 capacity_mw 907 non-null float64 5 latitude 861 non-null float64 6 longitude 861 non-null float64 7 primary_fuel 907 non-null object 8 other_fuel1 198 non-null object 9 other_fuel2 1 non-null object 10 other_fuel3 0 non-null float64 11 commissioning_year 527 non-null float64 12 owner 342 non-null object 13 source 907 non-null object 14 url 907 non-null object 15 geolocation_source 888 non-null object 16 wepp_id 0 non-null float64 17 year_of_capacity_data 519 non-null float64 18 generation_gwh_2013 0 non-null float64 19 generation_gwh_2014 398 non-null float64 20 generation_gwh_2015 422 non-null float64 21 generation_gwh_2016 434 non-null float64 22 generation_gwh_2017 440 non-null float64 23 generation_gwh_2018 448 non-null float64 24 generation_gwh_2019 0 non-null float64 25 generation_data_source 449 non-null object 26 estimated_generation_gwh 0 non-null float64 dtypes: float64(15), object(12) memory usage: 191.4+ KB
sns.countplot(x='primary_fuel', data=data)
<AxesSubplot:xlabel='primary_fuel', ylabel='count'>
sns.distplot(data['capacity_mw'])
<AxesSubplot:xlabel='capacity_mw', ylabel='Density'>
sns.boxplot(x='primary_fuel', y='capacity_mw', data=data)
<AxesSubplot:xlabel='primary_fuel', ylabel='capacity_mw'>
# showing the most columns
pd.set_option('display.max_columns',None)
df=pd.read_csv('database_IND.csv')
temp=df.copy()
df.head(5)
| country | country_long | name | gppd_idnr | capacity_mw | latitude | longitude | primary_fuel | other_fuel1 | other_fuel2 | other_fuel3 | commissioning_year | owner | source | url | geolocation_source | wepp_id | year_of_capacity_data | generation_gwh_2013 | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | generation_gwh_2019 | generation_data_source | estimated_generation_gwh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IND | India | ACME Solar Tower | WRI1020239 | 2.5 | 28.1839 | 73.2407 | Solar | NaN | NaN | NaN | 2011.0 | Solar Paces | National Renewable Energy Laboratory | http://www.nrel.gov/csp/solarpaces/project_det... | National Renewable Energy Laboratory | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | IND | India | ADITYA CEMENT WORKS | WRI1019881 | 98.0 | 24.7663 | 74.6090 | Coal | NaN | NaN | NaN | NaN | Ultratech Cement ltd | Ultratech Cement ltd | http://www.ultratechcement.com/ | WRI | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | IND | India | AES Saurashtra Windfarms | WRI1026669 | 39.2 | 21.9038 | 69.3732 | Wind | NaN | NaN | NaN | NaN | AES | CDM | https://cdm.unfccc.int/Projects/DB/DNV-CUK1328... | WRI | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | IND | India | AGARTALA GT | IND0000001 | 135.0 | 23.8712 | 91.3602 | Gas | NaN | NaN | NaN | 2004.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | 617.789264 | 843.747 | 886.004428 | 663.774500 | 626.239128 | NaN | Central Electricity Authority | NaN |
| 4 | IND | India | AKALTARA TPP | IND0000002 | 1800.0 | 21.9603 | 82.4091 | Coal | Oil | NaN | NaN | 2015.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | 3035.550000 | 5916.370 | 6243.000000 | 5385.579736 | 7279.000000 | NaN | Central Electricity Authority | NaN |
df['source'].nunique()
191
df['geolocation_source'].nunique()
3
print(" we have ",df.shape[0],'- Rows and ',df.shape[1],'-Columns')
we have 907 - Rows and 27 -Columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 907 entries, 0 to 906 Data columns (total 27 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 country 907 non-null object 1 country_long 907 non-null object 2 name 907 non-null object 3 gppd_idnr 907 non-null object 4 capacity_mw 907 non-null float64 5 latitude 861 non-null float64 6 longitude 861 non-null float64 7 primary_fuel 907 non-null object 8 other_fuel1 198 non-null object 9 other_fuel2 1 non-null object 10 other_fuel3 0 non-null float64 11 commissioning_year 527 non-null float64 12 owner 342 non-null object 13 source 907 non-null object 14 url 907 non-null object 15 geolocation_source 888 non-null object 16 wepp_id 0 non-null float64 17 year_of_capacity_data 519 non-null float64 18 generation_gwh_2013 0 non-null float64 19 generation_gwh_2014 398 non-null float64 20 generation_gwh_2015 422 non-null float64 21 generation_gwh_2016 434 non-null float64 22 generation_gwh_2017 440 non-null float64 23 generation_gwh_2018 448 non-null float64 24 generation_gwh_2019 0 non-null float64 25 generation_data_source 449 non-null object 26 estimated_generation_gwh 0 non-null float64 dtypes: float64(15), object(12) memory usage: 191.4+ KB
df[df['latitude'].isnull()].head(45)
| country | country_long | name | gppd_idnr | capacity_mw | latitude | longitude | primary_fuel | other_fuel1 | other_fuel2 | other_fuel3 | commissioning_year | owner | source | url | geolocation_source | wepp_id | year_of_capacity_data | generation_gwh_2013 | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | generation_gwh_2019 | generation_data_source | estimated_generation_gwh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 14 | IND | India | ANOOPGARH ST I&II | IND0000012 | 9.00 | NaN | NaN | Hydro | NaN | NaN | NaN | 1987.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 25 | IND | India | Abhijeet | WRI1020238 | 50.00 | NaN | NaN | Solar | NaN | NaN | NaN | 2015.0 | Ener-t International Ltd. | National Renewable Energy Laboratory | http://www.nrel.gov/csp/solarpaces/project_det... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 42 | IND | India | BAGLIHAR HEP-II | IND0000021 | 450.00 | NaN | NaN | Hydro | NaN | NaN | NaN | 2015.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | 55.3220 | 1750.18510 | 1812.84025 | 1848.620450 | NaN | Central Electricity Authority | NaN |
| 45 | IND | India | BALCO TPP | IND0000025 | 600.00 | NaN | NaN | Coal | Oil | NaN | NaN | 2015.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | 1437.9547 | 2800.00000 | 2120.68000 | 2553.000000 | NaN | Central Electricity Authority | NaN |
| 67 | IND | India | BCPL Wind Farm | WRI1026675 | 5.00 | NaN | NaN | Wind | NaN | NaN | NaN | NaN | Bharat Petroleum Corporation Limited | CDM | https://cdm.unfccc.int/Projects/DB/DNV-CUK1218... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 72 | IND | India | BELLARY DG | IND0000044 | 25.20 | NaN | NaN | Oil | NaN | NaN | NaN | 2000.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | 0.00 | 0.0000 | 0.00000 | 0.00000 | 0.000000 | NaN | Central Electricity Authority | NaN |
| 178 | IND | India | DIMBE | IND0000102 | 5.00 | NaN | NaN | Hydro | NaN | NaN | NaN | 1997.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 192 | IND | India | Davangere Wind Farm | WRI1026100 | 125.00 | NaN | NaN | Wind | NaN | NaN | NaN | NaN | NaN | CDM | https://cdm.unfccc.int/Projects/DB/DNV-CUK1142... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 201 | IND | India | E.G. CANAL | IND0000110 | 15.00 | NaN | NaN | Hydro | NaN | NaN | NaN | 1996.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 208 | IND | India | Essel Mining Wind Farm | WRI1026101 | 75.00 | NaN | NaN | Wind | NaN | NaN | NaN | NaN | NaN | CDM | https://cdm.unfccc.int/filestorage/O/6/L/O6LOW... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 250 | IND | India | Gujurat Solar On | WRI1020243 | 28.00 | NaN | NaN | Solar | NaN | NaN | NaN | 2014.0 | Cargo Solar Power | National Renewable Energy Laboratory | http://www.nrel.gov/csp/solarpaces/project_det... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 329 | IND | India | KASHANG INTEGRATED | IND0000517 | 195.00 | NaN | NaN | Hydro | NaN | NaN | NaN | 2016.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | 55.80955 | 196.14435 | 117.648800 | NaN | Central Electricity Authority | NaN |
| 330 | IND | India | KASHIPUR CCPP | IND0000518 | 225.00 | NaN | NaN | Gas | NaN | NaN | NaN | 2016.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | 462.28260 | 1312.40000 | 815.984131 | NaN | Central Electricity Authority | NaN |
| 332 | IND | India | KATGHORA TPP | IND0000187 | 35.00 | NaN | NaN | Coal | Oil | NaN | NaN | 2012.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | 0.00 | 0.0000 | 0.00000 | 0.00000 | 0.000000 | NaN | Central Electricity Authority | NaN |
| 386 | IND | India | KVK Energy Solar Project | WRI1020244 | 100.00 | NaN | NaN | Solar | NaN | NaN | NaN | 2013.0 | KVK Energy Ventures Ltd | National Renewable Energy Laboratory | http://www.nrel.gov/csp/solarpaces/project_det... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 420 | IND | India | LEIMAKHONG DG | IND0000231 | 36.00 | NaN | NaN | Oil | NaN | NaN | NaN | 1999.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | 0.00 | 0.0000 | 0.00000 | 0.00000 | 0.000000 | NaN | Central Electricity Authority | NaN |
| 422 | IND | India | LIKIM RO | IND0000233 | 24.00 | NaN | NaN | Hydro | NaN | NaN | NaN | 2001.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 430 | IND | India | LOWER LAGYAP | IND0000237 | 12.00 | NaN | NaN | Hydro | NaN | NaN | NaN | 1979.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 435 | IND | India | LVS POWER DG | IND0000241 | 36.80 | NaN | NaN | Oil | NaN | NaN | NaN | 2000.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | 0.00 | 0.0000 | 0.00000 | 0.00000 | 0.000000 | NaN | Central Electricity Authority | NaN |
| 440 | IND | India | Landewadi Wind Farm | WRI1026790 | 4.00 | NaN | NaN | Wind | NaN | NaN | NaN | NaN | Serum Institute of India Limited | CDM | https://cdm.unfccc.int/Projects/DB/LRQA%20Ltd1... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 466 | IND | India | MAQSOODPUR IPP | IND0000259 | 90.00 | NaN | NaN | Coal | Oil | NaN | NaN | 2011.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | 473.91 | 281.7300 | 378.68000 | 150.95200 | 126.494100 | NaN | Central Electricity Authority | NaN |
| 478 | IND | India | MOHAMAD PUR | IND0000270 | 9.30 | NaN | NaN | Hydro | NaN | NaN | NaN | 1949.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 479 | IND | India | MOHARA | IND0000271 | 9.00 | NaN | NaN | Hydro | NaN | NaN | NaN | 1962.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 483 | IND | India | MOYAGCHU | IND0000274 | 4.00 | NaN | NaN | Hydro | NaN | NaN | NaN | 1993.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 502 | IND | India | Megha Solar Plant | WRI1020245 | 50.00 | NaN | NaN | Solar | NaN | NaN | NaN | 2014.0 | Megha Engineering and Infrastructure | National Renewable Energy Laboratory | http://www.nrel.gov/csp/solarpaces/project_det... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 538 | IND | India | NIRGAJANI(Ganga Canal) | IND0000304 | 5.00 | NaN | NaN | Hydro | NaN | NaN | NaN | 1937.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 540 | IND | India | NIWARI TPP | IND0000305 | 45.00 | NaN | NaN | Coal | Oil | NaN | NaN | 2013.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | 298.28 | 124.9248 | 208.78880 | 77.33440 | 68.094400 | NaN | Central Electricity Authority | NaN |
| 545 | IND | India | NSL Wind Farm | WRI1026676 | 27.65 | NaN | NaN | Wind | NaN | NaN | NaN | NaN | Nuziveedu Seeds limited | CDM | https://cdm.unfccc.int/Projects/DB/DNV-CUK1173... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 655 | IND | India | RSMNL Wind Farm | WRI1026679 | 22.50 | NaN | NaN | Wind | NaN | NaN | NaN | NaN | Rajasthan State Mines and Minerals Limited | CDM | https://cdm.unfccc.int/Projects/DB/BVQI1201770... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 674 | IND | India | Ruchi Soya Palsodi Wind Farm | WRI1026678 | 22.50 | NaN | NaN | Wind | NaN | NaN | NaN | NaN | Rucha Soya Industries Limited | CDM | https://cdm.unfccc.int/Projects/DB/RWTUV128515... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 680 | IND | India | SAINJ | IND0000534 | 100.00 | NaN | NaN | Hydro | NaN | NaN | NaN | 2017.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | 134.31505 | 406.765950 | NaN | Central Electricity Authority | NaN |
| 705 | IND | India | SHAHPUR | IND0000402 | 6.60 | NaN | NaN | Hydro | NaN | NaN | NaN | 1997.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 711 | IND | India | SHIRPUR | IND0000535 | 150.00 | NaN | NaN | Coal | Oil | NaN | NaN | 2017.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | 4.31340 | 0.000000 | NaN | Central Electricity Authority | NaN |
| 712 | IND | India | SHIVAPURA | IND0000407 | 18.00 | NaN | NaN | Hydro | NaN | NaN | NaN | 1992.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 731 | IND | India | SONE EAST CANAL | IND0000419 | 3.30 | NaN | NaN | Hydro | NaN | NaN | NaN | 1996.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 732 | IND | India | SONE WEST CANAL | IND0000420 | 6.60 | NaN | NaN | Hydro | NaN | NaN | NaN | 1993.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 741 | IND | India | SURAT GARH | IND0000429 | 4.00 | NaN | NaN | Hydro | NaN | NaN | NaN | 1992.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 770 | IND | India | Sivalaperi Wind Farm | WRI1026677 | 24.00 | NaN | NaN | Wind | NaN | NaN | NaN | NaN | Green Infra Wind Power Projects Limited | CDM | https://cdm.unfccc.int/Projects/DB/RWTUV135219... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 773 | IND | India | Sterling Agro Akal Wind Farm | WRI1026754 | 3.00 | NaN | NaN | Wind | NaN | NaN | NaN | NaN | Sterling Agro Industries ltd. | CDM | https://cdm.unfccc.int/Projects/DB/SGS-UKL1333... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 774 | IND | India | Sterling Agro Mokla Wind Farm | WRI1026755 | 6.30 | NaN | NaN | Wind | NaN | NaN | NaN | NaN | Sterling Agro Industries ltd. | CDM | https://cdm.unfccc.int/Projects/DB/SGS-UKL1333... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 775 | IND | India | Sterling Karnataka site | WRI1026756 | 6.25 | NaN | NaN | Wind | NaN | NaN | NaN | NaN | Sterling Agro Industries ltd. | CDM | https://cdm.unfccc.int/Projects/DB/SGS-UKL1333... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 776 | IND | India | Sterling Madhya Pradesh site | WRI1026757 | 6.00 | NaN | NaN | Wind | NaN | NaN | NaN | NaN | Sterling Agro Industries ltd. | CDM | https://cdm.unfccc.int/Projects/DB/SGS-UKL1333... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 784 | IND | India | TAGO | IND0000437 | 4.50 | NaN | NaN | Hydro | NaN | NaN | NaN | 1991.0 | NaN | Central Electricity Authority | http://www.cea.nic.in/ | WRI | NaN | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 828 | IND | India | Tamilnadu Wind Farm | WRI1026102 | 74.00 | NaN | NaN | Wind | NaN | NaN | NaN | NaN | NaN | CDM | https://cdm.unfccc.int/filestorage/2/F/R/2FRTL... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 884 | IND | India | VRL Wind Farm | WRI1026103 | 42.50 | NaN | NaN | Wind | NaN | NaN | NaN | NaN | NaN | CDM | https://cdm.unfccc.int/Projects/DB/SGS-UKL1225... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
df[df['name']=='Abhijeet']
| country | country_long | name | gppd_idnr | capacity_mw | latitude | longitude | primary_fuel | other_fuel1 | other_fuel2 | other_fuel3 | commissioning_year | owner | source | url | geolocation_source | wepp_id | year_of_capacity_data | generation_gwh_2013 | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | generation_gwh_2019 | generation_data_source | estimated_generation_gwh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 25 | IND | India | Abhijeet | WRI1020238 | 50.0 | NaN | NaN | Solar | NaN | NaN | NaN | 2015.0 | Ener-t International Ltd. | National Renewable Energy Laboratory | http://www.nrel.gov/csp/solarpaces/project_det... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| capacity_mw | 907.0 | 326.223755 | 590.085456 | 0.0000 | 16.725000 | 59.200000 | 385.250000 | 4760.000 |
| latitude | 861.0 | 21.197918 | 6.239612 | 8.1689 | 16.773900 | 21.780000 | 25.512400 | 34.649 |
| longitude | 861.0 | 77.464907 | 4.939316 | 68.6447 | 74.256200 | 76.719500 | 79.440800 | 95.408 |
| other_fuel3 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| commissioning_year | 527.0 | 1997.091082 | 17.082868 | 1927.0000 | 1988.000000 | 2001.000000 | 2012.000000 | 2018.000 |
| wepp_id | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| year_of_capacity_data | 519.0 | 2019.000000 | 0.000000 | 2019.0000 | 2019.000000 | 2019.000000 | 2019.000000 | 2019.000 |
| generation_gwh_2013 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| generation_gwh_2014 | 398.0 | 2431.823590 | 4026.440035 | 0.0000 | 223.557672 | 801.123775 | 3035.306250 | 28127.000 |
| generation_gwh_2015 | 422.0 | 2428.226946 | 4194.596959 | 0.0000 | 176.381062 | 711.181225 | 3084.121250 | 30539.000 |
| generation_gwh_2016 | 434.0 | 2467.936859 | 4162.884308 | 0.0000 | 188.285252 | 737.205450 | 3282.861313 | 30015.000 |
| generation_gwh_2017 | 440.0 | 2547.759305 | 4196.991169 | 0.0000 | 177.874930 | 817.977250 | 3275.690475 | 35116.000 |
| generation_gwh_2018 | 448.0 | 2600.804099 | 4314.880456 | 0.0000 | 193.378250 | 751.644375 | 3143.535900 | 35136.000 |
| generation_gwh_2019 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| estimated_generation_gwh | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
df.columns
Index(['country', 'country_long', 'name', 'gppd_idnr', 'capacity_mw',
'latitude', 'longitude', 'primary_fuel', 'other_fuel1', 'other_fuel2',
'other_fuel3', 'commissioning_year', 'owner', 'source', 'url',
'geolocation_source', 'wepp_id', 'year_of_capacity_data',
'generation_gwh_2013', 'generation_gwh_2014', 'generation_gwh_2015',
'generation_gwh_2016', 'generation_gwh_2017', 'generation_gwh_2018',
'generation_gwh_2019', 'generation_data_source',
'estimated_generation_gwh'],
dtype='object')
df.isnull().sum()
country 0 country_long 0 name 0 gppd_idnr 0 capacity_mw 0 latitude 46 longitude 46 primary_fuel 0 other_fuel1 709 other_fuel2 906 other_fuel3 907 commissioning_year 380 owner 565 source 0 url 0 geolocation_source 19 wepp_id 907 year_of_capacity_data 388 generation_gwh_2013 907 generation_gwh_2014 509 generation_gwh_2015 485 generation_gwh_2016 473 generation_gwh_2017 467 generation_gwh_2018 459 generation_gwh_2019 907 generation_data_source 458 estimated_generation_gwh 907 dtype: int64
# % of data missing
round((df.isnull().sum()/df.shape[0])*100,2).sort_values(ascending=False)
estimated_generation_gwh 100.00 other_fuel3 100.00 wepp_id 100.00 generation_gwh_2013 100.00 generation_gwh_2019 100.00 other_fuel2 99.89 other_fuel1 78.17 owner 62.29 generation_gwh_2014 56.12 generation_gwh_2015 53.47 generation_gwh_2016 52.15 generation_gwh_2017 51.49 generation_gwh_2018 50.61 generation_data_source 50.50 year_of_capacity_data 42.78 commissioning_year 41.90 latitude 5.07 longitude 5.07 geolocation_source 2.09 country 0.00 url 0.00 country_long 0.00 primary_fuel 0.00 capacity_mw 0.00 gppd_idnr 0.00 name 0.00 source 0.00 dtype: float64
plt.figure(figsize=(15,10))
sns.heatmap(df.isnull(),annot=True)
<AxesSubplot:>
#It would be unnecessary to use these features because % columns contain 100% missing data. It would be better to drop them.
#Other_fuel2 has 99.89% missing values, making it unnecessary to use these features. It is therefore preferable to remove them.
#other_fuel1 has 78.17% missing data, making it unnecessary to use this feature. It is therefore preferable to remove it.
df['other_fuel2'].unique()
array([nan, 'Oil'], dtype=object)
df['other_fuel1'].unique()
array([nan, 'Oil', 'Cogeneration', 'Gas'], dtype=object)
df['other_fuel1'].mode()
0 Oil Name: other_fuel1, dtype: object
df['other_fuel1'].value_counts()
Oil 195 Gas 2 Cogeneration 1 Name: other_fuel1, dtype: int64
Even if we impute this with a mode that is also OIL, this feature has 195 OIL and will not be beneficial because it only has data of one type. Dropping this, too.
df.drop(['estimated_generation_gwh','other_fuel3','wepp_id','generation_gwh_2013','generation_gwh_2019'], axis=1, inplace= True)
df.shape
(907, 22)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 907 entries, 0 to 906 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 country 907 non-null object 1 country_long 907 non-null object 2 name 907 non-null object 3 gppd_idnr 907 non-null object 4 capacity_mw 907 non-null float64 5 latitude 861 non-null float64 6 longitude 861 non-null float64 7 primary_fuel 907 non-null object 8 other_fuel1 198 non-null object 9 other_fuel2 1 non-null object 10 commissioning_year 527 non-null float64 11 owner 342 non-null object 12 source 907 non-null object 13 url 907 non-null object 14 geolocation_source 888 non-null object 15 year_of_capacity_data 519 non-null float64 16 generation_gwh_2014 398 non-null float64 17 generation_gwh_2015 422 non-null float64 18 generation_gwh_2016 434 non-null float64 19 generation_gwh_2017 440 non-null float64 20 generation_gwh_2018 448 non-null float64 21 generation_data_source 449 non-null object dtypes: float64(10), object(12) memory usage: 156.0+ KB
df['country'].unique()
array(['IND'], dtype=object)
df['country'].nunique()
1
sns.countplot(df['country'])
<AxesSubplot:xlabel='country', ylabel='count'>
#Because it just provides one piece of information, this column is unnecessary.Dropping this
df['country_long'].unique()
array(['India'], dtype=object)
df['country_long'].nunique()
1
sns.countplot(df['country_long'])
<AxesSubplot:xlabel='country_long', ylabel='count'>
#This column is unnecessary because it only provides one sort of information.We are dropping this
df['geolocation_source'].unique()
array(['National Renewable Energy Laboratory', 'WRI', nan,
'Industry About'], dtype=object)
sns
<module 'seaborn' from 'C:\\Users\\lenovo\\anaconda3\\lib\\site-packages\\seaborn\\__init__.py'>
df['name'].unique()
array(['ACME Solar Tower', 'ADITYA CEMENT WORKS',
'AES Saurashtra Windfarms', 'AGARTALA GT', 'AKALTARA TPP',
'AKRIMOTA LIG', 'ALIYAR', 'ALLAIN DUHANGAN', 'ALMATTI DAM',
'AMAR KANTAK', 'AMARAVATI TPP', 'ANANDPUR SAHIB ST-I&II',
'ANAPARA "C"', 'ANDHRA', 'ANOOPGARH ST I&II', 'ANPARA', 'ANTA GT',
'ANUPPUR', 'ANUPUR TPP', 'ARVINDNAGAR SUGAR', 'ASHOKNAGAR MILL',
'ATHANI SUGAR', 'AURAIYA GT', 'AVANTHA BHANDAR TPP',
'AWARPUR CEMENT PLANT', 'Abhijeet', 'Amarsar Solar Power Plant',
'Anabura Wind Farm', 'Ananthapur - Meil Solar Power Plant',
'Andrha Lake Wind Farm', 'Anikkadavu Wind Farm',
'Arasinagundi Wind Farm', 'Askandra - DREPL Solar Power Plant',
'Askandra - Electromech Maritech Solar Power Plant',
'Askandra - Finehope Solar Power Plant',
'Askandra - Khaya Solar Power Plant',
'Askandra - Newton Solar Power Plant',
'Askandra - Saidham Solar Power Plant',
'Askandra - Vasavi Solar Power Plant', 'BADARPUR', 'BAGALKOT ICP',
'BAGLIHAR HEP', 'BAGLIHAR HEP-II', 'BAIRA SIUL', 'BAKRESWAR',
'BALCO TPP', 'BALIMELA', 'BALLARPUR BILT POWER', 'BAMANIA CEMENT',
'BANDAKHAR TPP', 'BANDEL', 'BANER', 'BANSAGAR (I)',
'BANSAGAR (II)', 'BANSAGAR (III)', 'BANSAGAR (IV)',
'BANSWARA CEMENT', 'BARADARHA TPP', 'BARAMURA', 'BARAUNI', 'BARGI',
'BARH STPP II', 'BARKHERA TPP', 'BARSINGAR LIGNITE',
'BASIN BRIDGE GT', 'BASPA', 'BASSI', 'BCPL Wind Farm',
'BEL Chikhalhol Wind Farm', 'BELA NAGPUR', 'BELGAUM UGAR',
'BELLAD-BAGEWADI', 'BELLARY DG', 'BELLARY HOTHUR', 'BELLARY JANKI',
'BELLARY TPS', 'BHADRA', 'BHADRAVATI MILL', 'BHAKRA',
'BHANDARA STEEL MILL', 'BHANDARA VARAM', 'BHANDARDHARA',
'BHATGARH', 'BHATSA', 'BHAVNAGAR TPP', 'BHAWANI KATTALAI -II',
'BHAWANI KATTALAI -III', 'BHAWANI KATTALAI BARRAGE',
'BHIGWAN MILL', 'BHILAI TPP', 'BHIRA', 'BHIRA TAIL RACE',
'BHIVPURI', 'BHUSAWAL', 'BILIYAKALAN PLANT', 'BINA REFINERY',
'BINA TPP', 'BINANI CEMENT PLANT', 'BINJKOTE', 'BINWA',
'BIRSINGHPUR', "BOKARO A ''EXP''", 'BOKARO B', 'BONGAIGAON TPP',
'BP Brahmanvel Wind Farm', 'BRAMHAPURAM DG', 'BSES Kerala CCGT',
'BUDGE BUDGE', 'BUDHIL', 'BUTIBORI PLANT', 'BUTIBORI TPP -II',
'Bamani Solar Power Plant', 'Bamanwali Solar Power Plant',
'Bannari Amman Spinning Mills Wind Power Project',
'Bap - GIS 1-2 Solar Power Plant', 'Bap - GIS 3 Solar Power Plant',
'Bap - OPG Solar Power Plant',
'Bap - Punj Lloyd Solar Power Plant',
'Bapodar - Hiraco Solar Power Plant',
'Bapodar - MBEDL Solar Power Plant',
'Bavadi Barsingha - Giriraj Solar Power Plant',
'Bavadi Barsingha - LEPL Solar Power Plant', 'Belagau Wind Farm',
'Belagwi Wind Turbines', 'Belgaum Gadag Wind Farm',
'Belgaum Wind farm', 'Bera Wind Farm',
'Bhadrada Solar Power Plant', 'Bhat Khedi Wind Farm',
'Bhatkota Solar Power Plant', 'Bitta Solar Power Plant',
'Boha Solar Power Plant', 'Bonada Solar Power Plant',
'CHAKABURA TPP', 'CHAMERA II', 'CHAMERA-I', 'CHAMERA-III',
'CHANDA CEMENT WORKS', 'CHANDERIA CEMENT WORKS',
'CHANDERIA SMELTER', 'CHANDRAPUR GUPTA', 'CHANDRAPUR MANIKGHAR',
'CHANDRAPURA', 'CHANDRAPUR_Coal', 'CHANJU-I', 'CHEMBUKADAVU-II',
'CHENANI I to III', 'CHHABRA TPS', 'CHIBRO (YAMUNA)', 'CHILLA',
'CHUTAK', 'CHUZACHEN', 'CORE GREEN SUGAR',
'Chandrapur Solar Power Plant', 'Charanka Solar Power Plant',
'Chhadavada Wind Farm', 'Chingerbandh Solar Power Plant', 'D.P.L.',
'DADRI (NCTPP)', 'DADRI GT', 'DAE RAJASTHAN', 'DAHANU FRESNEL',
'DAMODARAM SANJEEVAIAH', 'DANAPUR BMM', 'DAUND SUGAR',
'DAVANGERE MILL', 'DEHAR', 'DERANG', 'DGEN MEGA CCCP', 'DHAKRANI',
'DHALIPUR', 'DHANU', 'DHARIWAL INF TPP', 'DHAULI GANGA', 'DHOLPUR',
'DHUVARAN CCPP', 'DIGHE WORKS', 'DIKCHU', 'DIMBE', 'DLF',
'DNYANESHWAR MILL', 'DOLVI ISPAT', 'DONKARAYI', 'DOYANG',
'DUDH GANGA', 'DUGGAVATHI MILL', 'DULHASTI', 'DURG SPONGE IRON',
'DURGAPUR', 'DURGAPUR STEEL TPS', 'Dadri Solar Power Plant',
'Dandeli Mill', 'Davangere Wind Farm', 'Deh Solar Power Plant',
'Devgarh Wind Farm', 'Dhabla Sondhya Solar Power Plant',
'Dhanera Solar Power Plant', 'Dhank - Aravali Solar Power Plant',
'Dhursar', 'Diwakar', 'Dugar Solar Power Plant', 'E.G. CANAL',
'ELDARI', 'EMCO Sonurle Wind Farm', 'ESSAR GT IMP.',
'Enercon Babarzar Wind Farm', 'Enercon Chitradurga Wind Farm',
'Enercon Sonoshi Wind Farm', 'Essel Mining Wind Farm',
'FARAKKA STPS', 'F_BAD CCGT',
'Frost International Palankottai Wind Farm', 'G.I.P.C.L. GT',
'GACL Kutch Wind Farm', 'GADARWARA', 'GADEPAN WORKS', 'GAJ',
'GAMA CCPP Module - 1', 'GANDHAR GT', 'GANDHARBAL',
'GANDHI NAGAR', 'GANDHI SAGAR', 'GANGUWAL', 'GAUTAMI CCCP', 'GEPL',
'GFL Gudhepanchgani', 'GHANVI', 'GHAT PRABHA', 'GHATGHAR PSS',
'GHTP (LEH.MOH.)', 'GHUGUS LMEL', 'GIRI BATA', 'GIWPP Telagi',
'GMR WARORA TPP', 'GODAVARI GT', 'GOINDWAL SAHIB TPP',
'GRACE INDUSTRIES', 'GREL CCPP (RAJAMUNDRY)', 'GULBARGA CEMENT',
'GUMTI', 'Gadag Wind Farm', 'Gadna 1 Solar Power Plant',
'Gadna 2 Solar Power Plant', 'Gandhinagar Solar Power Plant',
'Garacharma Solar Power Plant', 'Godawari',
'Godelai Solar Power Plant', 'Gondri Solar Power Plant',
'Gorsar Wind Farm', 'Grewal Solar Power Plant', 'Gujurat Solar On',
'Gulabpura Solar Power Plant', 'Gwal Pahari Solar Power Plant',
'HALDIA', 'HAMPI', 'HAPURI MILL', 'HASDEOBANGO', 'HAVALGA SUGAR',
'HAZIRA CCCP', 'HAZIRA-GSECL', 'HINGOLI MILL', 'HIRAKUD I&II',
'HIRIYUR BIOMASS', 'HZL Gadag Wind Farm',
'HZL Nandurbar Maharashtra', 'HZL Palladum Wind Farm',
'HZL Parewar Wind Farm', 'HZL Samana Wind Farm', 'H_GANJ B',
'Hirewaddatti Wind Farm', 'Horti Solar Power Plant', 'I.B.VALLEY',
'I.P.GT', 'IDAMALAYAR', 'IDUKKI', 'IEPL BELA TPP',
'IL&FS Kaladeh Wind Farm', 'IND BARATH TPP',
'INDIA POWER TPP(HALDIA HIRANMAYE )', 'INDIRA SAGAR',
'INDRA GANDHI STPP', 'ITPCL TPP', 'Ingaleshwar Wind Farm',
'Isanpur Solar Power Plant', 'Itnal Solar Power Plant', 'J.SAGAR',
'JAISALMER GODAWARI', 'JALDHAKA I&II', 'JALLIPPA KAPURDI TPP',
'JAYKAYPURAM PLANT', 'JAYPEE BELA CEMENT', 'JAYPEE REWA CEMENT',
'JAYPEE SIDHI CEMENT', 'JEGURUPADU GT', 'JOG', 'JOJBERA',
'JORETHANG LOOP', 'JSW RATNAGIRI TPP',
'Jafrabad Solar Power Plant', 'Jamuria Solar Power Plant',
'Jangi Wind Farm', 'Jangi village wind farm', 'Jeloo Wind Farm',
'Jodiya Wind Farm', 'K.N. Ram Kaltek Wind Farm', 'KADAMPARI',
'KADANA', 'KADRA', 'KAHALGAON', 'KAIGA', 'KAKATIYA TPP', 'KAKKAD',
'KAKRAPARA', 'KALINADI', 'KALINADI SUPA', 'KALISINDH', 'KALLADA',
'KALLUR GRINDING', 'KAMAL SPONGE', 'KAMALANGA', 'KANHER',
'KARAD MILL', 'KARAIKAL', 'KARBI LANGPI', 'KARCHAM WANGTOO',
'KARGIL', 'KARUPPUR GT', 'KASAIPALLI', 'KASARGODE DG',
'KASHANG INTEGRATED', 'KASHIPUR CCPP', 'KASRAWAD FACTORY',
'KATGHORA TPP', 'KATHALGURI GT', 'KAWAI TPP', 'KAWAS GT',
'KAYAM KULAM GT', 'KCSF SANGUR', 'KHADAVASLA I&II',
'KHAMBERKHERA IPP', 'KHANDONG', 'KHARA', 'KHATIMA', 'KHODRI',
'KHOPOLI', 'KHOPOLI WORKS', 'KISHENGANGA', 'KM DODDI SUGAR',
'KODASALI', 'KODAYAR-I&II', 'KODERMA', 'KOJIKODE DG', 'KOLAGHAT',
'KOLDAM', 'KOLHAPUR HEMARUS', 'KOLHAPUR KAGAL', 'KOLHAPUR KUMBHI',
'KOLLEGAL SUGAR', 'KONASEEMA CCCP', 'KONDAPALLI GT', 'KOPILI',
'KOPPA SUGAR', 'KOPPAL HKMP', 'KORADI', 'KORBA STPS', 'KORBA-EAST',
'KORBA-WEST', 'KOSI', 'KOTA', 'KOTA COMPLEX', 'KOTESHWAR', 'KOTLA',
'KOVILKALAPPAL', 'KOYNA COMPLEX', 'KS Oils Chinnakuyili Wind Farm',
'KUDGI', 'KULHAL', 'KUNDAH I-V', 'KUNDANKULAM', 'KUNDARGI MILL',
'KUNDARKI TPP', 'KUTCH LIG.', 'KUTHUNGAL', 'KUTTALAM GT',
'KUTTIADI', 'KUTTIYADI Add ext', 'KVK Energy Solar Project',
'KYMORE', 'KYREDEMKULAI', 'K_GUDEM', 'K_GUDEM NEW', 'K_KHEDA II',
'Kaladonger Wind Farm', 'Kamuthi Solar Power Plant',
'Karmaria Solar Power Plant', 'Karur Textile Park Wind Farm',
'Kathauti 1 Solar Power Plant', 'Kathauti 2 Solar Power Plant',
'Katol - Citra Solar Power Plant',
'Katol - Sepset Solar Power Plant', 'Khadoda Solar Power Plant',
'Khandala Solar Power Plant',
'Khetusar - Northern Solar Power Plant',
'Khetusar - Suryauday Solar Power Plant',
'Khilchipur Solar Power Plant',
'Khirsara - Unity Solar Power Plant',
'Khirsara - Welspun Solar Power Plant', 'Khirwa Solar Power Plant',
'Khori Wind Farm', 'Kohinoor Wind Farm',
'Komal West Solar Power Plant', 'Kosa Wind Farm',
'Kottamangalam Wind Farm', 'Kottathara Village Wind Farm',
'Kudankulam', 'LAKHERI CEMENT PLANT', 'LAKWA GT', 'LALITPUR TPP',
'LARA', 'LARGI', 'LEIMAKHONG DG', 'LIGANAMAKKI', 'LIKIM RO',
'LITL Uthumalai', 'LNJP Bahla Basti Wind Farm', 'LODHIVALI',
'LOKMANGAL SUGAR ETHANOL', 'LOKTAK', 'LOWER JHELUM',
'LOWER JURALA', 'LOWER LAGYAP', 'LOWER METTUR', 'LOWER PERIYAR',
'LOWER SILERU', 'LTML Wind Energy Project in Tamil Nadu',
'LVS POWER DG', 'Lakhewali Solar Power Plant',
'Lakhmirwala Solar Power Plant', 'Lallian Kalan Solar Power Plant',
'Lalpur Wind Farm', 'Landewadi Wind Farm',
'Lathi Solar Power Plant', 'Liberty Oil Satara Wind Farm',
'M.A.P.P.', 'MACHKUND', 'MADHAVAMANTRI', 'MADHIKHEDA',
'MAHADEV PRASAD STPP', 'MAHAN TPP', 'MAHATMA GANDHI TPP',
'MAHATMA SUGAR', 'MAHI BAJAJ I&II', 'MAHUL REFINERY',
'MAIHAR CEMENT PLANT', 'MAITHON', 'MAITHON RB TPP', 'MALANA',
'MALLARPUR', 'MANERI BHALI', 'MANGALORE REFINERY', 'MANGAON CCPP',
'MANGLAM CEMENT', 'MANI DPH', 'MANIKDOH', 'MANIKGARH CEMENT',
'MANIYAR', 'MAQSOODPUR IPP', 'MARATHA CEMENT PLANT', 'MARWA TPP',
'MATATILLA', 'MEJA STPP', 'MEJIA', 'MEJIA TPS EXT',
'MELAMARUTHUR (MUTHIARA)', 'METTUR', 'METTUR DAM',
'METTUR TPS EXT', 'MIHAN TPP', 'MOHAMAD PUR', 'MOHARA',
'MONARCHAK CCPP', 'MORDI PLANT', 'MOUDA STPS', 'MOYAGCHU', 'MOYAR',
'MRMPL Mudari Wind Farm', 'MSPL Wind Farm', 'MUDHOL CEMENT',
'MUKERIAN I -IV', 'MUL GRETA', 'MUMBAI BPCL', 'MUMBAI JINDAL',
'MUMBAI ONGC', 'MUNDRA TPP', 'MUNDRA UMPP', 'MUNIRABAD',
'MUNOLI DISTILLERY', 'MURBAD PLANT', 'MUZAFFARPUR', 'MYNTDU',
'Maloshi Wind Farm', 'Mandrup - Giriraj Solar Power Plant',
'Megha Solar Plant', 'Mirpur Kalan - Alianz Eco Solar Power Plant',
'Mirpur Kalan - Nexgen Solux Solar Power Plant',
'Mithapur Solar Power Plant', 'Mokal Wind Farm', 'Mokla Wind farm',
'Mulshi Solar Power Plant',
'Muradwala Dal Singh Solar Power Plant', 'N.A.P.S',
'NABI NAGAR TPP', 'NAGARJUNA SAGAR', 'NAGARJUNA SAGAR TPD',
'NAGDA GRASIM', 'NAGOTHANE COMPLEX', 'NAGPUR GMT', 'NAGPUR MILL',
'NAGPUR SPONGE IRON', 'NAMRUP GT', 'NARASINGPUR', 'NARAYANPUR',
'NARIMANGLAM', 'NASIK', 'NASIK SINNAR', 'NASIK TPP PH 1',
'NATHPA JHAKRI', 'NAWAPARA', 'NEPANAGAR MILL', 'NEW UMTRU',
'NEYVELI FST EXT', 'NEYVELI ST I', 'NEYVELI ST II',
'NEYVELI TPS EXP -II', 'NEYVELI TPS(Z)', 'NIGRI',
'NIMBAHERA JK CEMENT', 'NIMOO BAZGO', 'NIRGAJANI(Ganga Canal)',
'NITIN PLANT', 'NIWARI TPP', 'NIZAM SAGAR',
'NMDC Anehalu Wind Farm', 'NORTH CHENNAI',
'NORTH CHENNAI EXTENSION', 'NSL Wind Farm', 'NURANANG',
'N_SAGAR LBC', 'N_SAGAR RBC', 'Nagda Hills Wind Energy project',
'Nagercoli Wind Farm', 'Naini Solar Power Plant',
'Nallakonda Wind Farm', 'Nargund Wind Farm',
'Nokh - GGEL Solar Power Plant',
'Nokh - Precision Technik Solar Power Plant',
'Nokha Daiya Solar Power Plant', 'NuPower Tirunelveli Wind Farm',
'OBRA', 'OBRA-A', 'OMKARESHWAR', 'ONGC Jerat Wind Farm',
'ONGC Kutch Wind Farm', 'P.NALLUR CCGT', 'PAGUTHAN', 'PAINAMPURAM',
'PAITHON', 'PALATANA CCPP', 'PALLIVASAL', 'PAMPORE GT', 'PANCHET',
'PANIPAT', 'PANNIAR', 'PAPANASAM', 'PARAS', 'PARBATI-III',
'PARBHANI GANGAKHED', 'PARE', 'PARICHA', 'PARLI', 'PARSEN_S VALLE',
'PATALGANGA', 'PATHADI TPS PH -I', 'PATHRI', 'PAWANA',
'PEDDAPURAM CCGT(Samalkot)', 'PENCH', 'PENNA AHOBELAM', 'PERIYAR',
'PIPAVAV CCCP', 'POCHAMPAD', 'PONG', 'PORINGALKUTTU',
'PORINGALKUTTU L', 'PRAGATI CCCP -III', 'PRAGATI CCGT',
'PRIYADARSHNI JURALA', 'PRYAGRAJ (BARA) TPP', 'PULICHINTALA',
'PUNE MILL', 'PURULIA PSS', 'PYKARA', 'PYKARA ALIMATE',
'Panandhro Solar Power Plant', 'Patan Taluka Wind Farm',
'Patnaik Hatti Wind Farm', 'Patnaik Jamnagar Wind Farm',
'Patodi Solar Power Plant', 'Phagi Solar Power Plant',
'Pokhran FVE Solar Power Plant', 'Pokhran THE Solar Power Plant',
'Powerica Jangi Vandjiya', 'Powerica Kutch Wind Farm',
'Powerica Theni Wind Farm', 'Powerica Tirunelveli Wind Farm',
'Pratapgarh Solar Power Plant', 'Precious Solar Power Plant',
'Punjawa - Sikh Wala Solar Power Plant', 'R.A.P.S.', 'R.P.SAGAR',
'RABRIYAWAS CEMENT', 'RADHANAGRI', 'RAGHUNATHPUR TPP PH-I',
'RAICHUR', 'RAICHUR STEEL', 'RAIGARH TPP', 'RAIKHEDA',
'RAJASHREE CEMENT', 'RAJGHAT', 'RAJGHAT (MP)',
'RAJIV GANDHI TPS HISAR', 'RAJPURA DARIBA MINE', 'RAJPURA TPP',
'RAMANAGARAM MILL', 'RAMGANGA', 'RAMGARH GT', 'RAMMAM', 'RAMPUR',
'RANGANADI', 'RANGIT-III', 'RANJANI MILL', 'RANJIT SAGAR',
'RAS CEMENT PLANT', 'RATIJA TPP', 'RATNAGIRI GAS', 'RAYAL SEEMA',
'RELIANCE ENERGY', 'RENGALI', 'RIHAND', 'RIHAND_Hydro',
'RITHALA CCCP', 'RMC MANGROL', 'ROKHIA GT', 'ROPAR',
'ROSA TPP PH - 1', 'RSMNL Wind Farm', 'R_GUNDEM - B',
'R_GUNDEM STPS', 'Rajgarh - NTPC Solar Power Plant',
'Rajgarh - Ujaas Solar Power Plant', 'Raksa Solar Power Plant',
'Ram Tirath Jaga Solar Power Plant',
'Ramagundam Solar Power Plant', 'Rapar Khokhara Solar Power Plant',
'Rasulpur Solar Power Plant', 'Ratedi Mondri Wind Farm',
'Ravra - IOC Solar Power Plant', 'Ravra - NWEPL Solar Power Plant',
'Ravra - SEI Solar Power Plant',
'Ravra - Welspun Solar Power Plant',
'Relaxo Footwear Kui Inda Wind Farm',
'Roaring 40s Wind Farms (Khandke) Private Limited - Phase II',
'Rojhani Solar Power Plant', 'Rucha Soya Wind Farm',
'Ruchi Soya Palsodi Wind Farm', 'S.SAROVAR CHPH', 'S.SAROVAR RBPH',
'SABARIGIRI', 'SAGARDIGHI TPP', 'SAHAKAR SHIROMANI', 'SAINJ',
'SALAL I & II', 'SALAYA TPP', 'SALORA', 'SAMALPATTI DG',
'SAMAYANALLUR DG', 'SAMEERWADI MILL', 'SANGLI KRANTI',
'SANGLI MIRAJ BIOMASS', 'SANGLI VASANTDADA', 'SANJAY BHABA',
'SANJAY GANDHI', 'SANKESHWAR MILL', 'SANTALDIH', 'SARKARPATHY',
'SASAN UMPP', 'SATNA CEMENT PLANT', 'SATPURA', 'SBT SUGAR',
'SEIONI TPP', 'SEMBCORP GAYATRI', 'SENGULAM', 'SERVALAR',
'SEWA-II', 'SEWA-III', 'SHAHPUR', 'SHANAN', 'SHARAVATHY',
'SHARAVATHY TAIL RACE (Gerusupa)', 'SHIMSAPURA', 'SHIROL MILL',
'SHIRPUR', 'SHIVAPURA', 'SHOLAYAR', 'SHOLAYAR I&II',
'SHREE MEGA POWER', 'SHRI MALAPRABHA', 'SHRI SHANKAR',
'SHRIRAM RAYONS PLANT', 'SIIL Jaibhim Wind Farm', 'SIKKA REP.',
'SIMHADRI', 'SIMHAPURI TPP', 'SINGARENI TPP', 'SINGRAULI STPS',
'SINGUR', 'SIPAT STPS', 'SITAPURAM POWER LIMITED', 'SIVASAMUNDRUM',
'SOLAPUR STPP', 'SONAI MILL', 'SONE EAST CANAL', 'SONE WEST CANAL',
'SOUTHERN REPL.', 'SRINAGAR', 'SRISAILAM', 'SRISAILAM LBPH',
'STAKNA', 'STERLITE TPP', 'SUBERNREKHA I&II', 'SUGEN CCCP',
'SURAT GARH', 'SURAT LIG.', 'SURATGARH', 'SURULIYAR', 'SURYA',
'SVPL', 'SWASTIK KORBA', 'SWPPL Madhavakurichi',
'SWPPL Manurpalayam', 'SWPPL Vadakkupanavadali',
'Sadeipali - ASPL Solar Power Plant',
'Sadeipali - REHPL Solar Power Plant', 'Sai Sulphonates Wind Mill',
'Sakri 1A Solar Power Plant', 'Sakri 1B Solar Power Plant',
'Sangatpura Solar Power Plant', 'Satara Wind Farm',
'Shiloj Solar Power Plant', 'Shirsoli Solar Power Plant',
'Shivasamundra Solar Power Plant',
'Shivlakha - Backbone Solar Power Plant',
'Shivlakha - CPEC Solar Power Plant',
'Shivlakha - Konark Solar Power Plant', 'Shree Nashik Wind Farm',
'Shree Sangli Wind Farm', 'Shree Southern Wind Farm',
'Shri Singaji MALWA TPP', 'Singrauli Solar Power Plant',
'Sivaganga Solar Power Plant', 'Sivalaperi Wind Farm',
'Solitaire Solar Power Plant', 'Sonu Wind Farm',
'Sterling Agro Akal Wind Farm', 'Sterling Agro Mokla Wind Farm',
'Sterling Karnataka site', 'Sterling Madhya Pradesh site',
'Sumilon Shikarpur Wind Farm', 'Sumilon Vershamedi Wind Farm',
'Supe Solar Power Plant', 'Surajbari Creek Wind Farm', 'T.B. DAM',
'TADALI', 'TADALI SPONGE IRON', 'TAGO', 'TALCHER', 'TALCHER STPS',
'TALWANDI SABO', 'TAMNAR TPP', 'TANAKPUR', 'TANDA', 'TANIR BAVI',
'TARAPUR', 'TASHIDING', 'TAWA', 'TEESTA I-III', 'TEESTA -V',
'TEESTA LOW DAM -IV', 'TEESTA LOW DAM-III', 'TEESTA-III',
'TEHRI ST -1', 'TENUGHAT', 'THAL WORKS', 'THAMMINAPATNAM TPP',
'THANE PLANT', 'THIROT', 'TILLARI', 'TILLAYA', 'TIRORA TPP',
'TITAGARH', 'TORANGALLU EXT', 'TORANGALLU IMP', 'TORR POWER SAB.',
'TROMBAY', 'TROMBAY GT', 'TROMBAY WORKS', 'TROMBAY_Coal',
'TROMBAY_Oil', 'TUIRIAL', 'TUNGABHADRA SUGAR', 'TUTICORIN',
'TUTICORIN JV', 'TUTICORIN- IND BARATH', 'TVS Kunnur Wind Farm',
'TVS Tirunelveli Wind Farm', 'TVS Virali Wind Farm',
'Tadas wind farm', 'Talcher Kaniha Solar Power Plant',
'Tamilnadu Wind Farm',
'Tata BP - Electronics City Solar Panel Plant',
'Tata Power Gadag Plains Wind Farm', 'Tata Wind Farm',
'Teona Pujarian Solar Power Plant', 'Theni Wind Farm',
'Thimmapuram Wind Farm', 'Tinwari - MB Solar Power Plant',
'Tiruppur', 'Tiruppur 2', 'Torrent Gujurat Wind Farm',
'U.B.D.C. ST.-I& II', 'U.ROGNICHU', 'UCHPINDA TPP', 'UDUPI TPP',
'UJJAINI', 'UKAI LBC', 'UKAI_Coal', 'UKAI_Hydro', 'UMIAM I II &IV',
'UMTRU (NEW)', 'UNCHAHAR', 'UNO SUGEN', 'UPL Dharapuram Wind Farm',
'UPPAR INDRAVATI', 'UPPER KOLAB', 'UPPER SILERU I&II',
'UPPER SINDH I& II', 'URAN GT', 'URAN ONGC', 'URI', 'URI -II',
'URUMI', 'UTRAN CCCP EXT', 'UTRAULA TPP', 'Udumalpet Wind Farm',
'VAIGAI DAM', 'VAITARNA', 'VALANTHARVI GT', 'VALLUR ntpc/ntecl',
'VALUTHUR GT', 'VARAHI', 'VARDHAM BUDNI', 'VARDHAM SATLAPUR',
'VASAVADATTA CEMENT', 'VEMAGIRI CCCP', 'VENUNAGAR SUGAR',
'VIJAIPUR WORKS', 'VIJAYWADA', 'VIJAYWADA TPP-IV', 'VIJESWARAM GT',
'VIKRAM CEMENT', 'VINDH_CHAL STPS', 'VIR', 'VISHNU PRAYAG',
'VIZAG TPP', 'VRL Wind Farm', 'Vaayu Ashti Wind Farm',
'Vankuswade Wind Farm', 'Vastan Solar Power Plant',
'WADI CEMENT PLANT', 'WANAKBORI',
'WARDHA WARORA(Sai Wardha Power)', 'WARDHA WORKS', 'WARNA',
'WY.CANAL A -D', 'Wadgam Solar Power Plant',
'Warora Solar Power Plant', 'Weizzmann Forex Idukki Wind Farm',
'Wind power project by Riddhi Siddhi Gluco Biols Limited (RSGBL)',
'Wind power project in Rajasthan', 'YAMUNANAGAR TPP',
'YASHWANTRAO MOHITE', 'YELHANKA (DG)', 'YERMARUS TPP',
'Yelesandra Solar Power Plant', 'Yelisirur wind power project',
'ZAWAR MINES', 'iEnergy Theni Wind Farm'], dtype=object)
df['name'].nunique()
907
sns.countplot(df['name'])
<AxesSubplot:xlabel='name', ylabel='count'>
df['other_fuel1'].nunique(dropna=False)
4
df['other_fuel1'].value_counts()
Oil 195 Gas 2 Cogeneration 1 Name: other_fuel1, dtype: int64
df['other_fuel2'].nunique()
1
df['other_fuel2'].unique()
array([nan, 'Oil'], dtype=object)
df['other_fuel2'].value_counts(dropna=False)
NaN 906 Oil 1 Name: other_fuel2, dtype: int64
#Even if we impute this with Mode, the feature will only include one type of data, which is irrelevant for model prediction.
# So dropping this
df.drop(['country','country_long','name','other_fuel1','other_fuel2' ], axis=1,inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 907 entries, 0 to 906 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gppd_idnr 907 non-null object 1 capacity_mw 907 non-null float64 2 latitude 861 non-null float64 3 longitude 861 non-null float64 4 primary_fuel 907 non-null object 5 commissioning_year 527 non-null float64 6 owner 342 non-null object 7 source 907 non-null object 8 url 907 non-null object 9 geolocation_source 888 non-null object 10 year_of_capacity_data 519 non-null float64 11 generation_gwh_2014 398 non-null float64 12 generation_gwh_2015 422 non-null float64 13 generation_gwh_2016 434 non-null float64 14 generation_gwh_2017 440 non-null float64 15 generation_gwh_2018 448 non-null float64 16 generation_data_source 449 non-null object dtypes: float64(10), object(7) memory usage: 120.6+ KB
df['gppd_idnr'].nunique()
907
df['gppd_idnr']
0 WRI1020239
1 WRI1019881
2 WRI1026669
3 IND0000001
4 IND0000002
...
902 IND0000513
903 WRI1026222
904 WRI1026776
905 WRI1019901
906 WRI1026761
Name: gppd_idnr, Length: 907, dtype: object
df['primary_fuel'].unique()
array(['Solar', 'Coal', 'Wind', 'Gas', 'Hydro', 'Biomass', 'Oil',
'Nuclear'], dtype=object)
df['owner'].nunique()
280
df['source'].nunique()
191
df['url'].nunique()
304
df['url'].unique()
array(['http://www.nrel.gov/csp/solarpaces/project_detail.cfm/projectID=262',
'http://www.ultratechcement.com/',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1328700673.83/view',
'http://www.cea.nic.in/', 'http://www.hindustanpowerprojects.com/',
'http://www.ambedkarsugar.com/',
'http://www.mapsofindia.com/whitepages/foods-and-beverages/food-processors-and-manufacturers/ashokssklimited',
'http://www.renukasugars.com/',
'http://www.nrel.gov/csp/solarpaces/project_detail.cfm/projectID=254',
'http://www.navbharatbuildcon.com/windmill--solar-plant.html',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1216117082.43/view',
'http://www.meil.in/power-projects',
'https://cdm.unfccc.int/Projects/DB/LRQA%20Ltd1348755977.79/view',
'https://cdm.unfccc.int/Projects/DB/BVQI1379744232.36/viewhttps://cdm.unfccc.int/Projects/DB/LRQA%20Ltd1278588174.29/view',
'http://www.lancosolar.com/pdfs/rajasthan-pv-project-details.pdf',
'http://www.icpluttur.com/', 'http://bilt.com/',
'http://www.jkcement.com/', 'http://www.indiacements.co.in/',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1218300485.48/view',
'https://cdm.unfccc.int/Projects/DB/RINA1416216844.8/view',
'http://www.idealenergysolar.com/', 'http://www.ugarsugar.com/',
'http://www.vssil.co.in/', 'http://hothurindia.com/',
'http://jankicorp.com/', 'http://www.mpm.co.in/',
'http://www.sunflagsteel.com/',
'http://www.bloomberg.com/profiles/companies/4115076Z:IN-varam-bio-energy-pvt-ltd',
'http://www.sangamgroup.com/', 'http://borl.in/',
'http://binaniindustries.com/',
'https://cdm.unfccc.int/filestorage/T/0/C/T0CJ85FSL1WN9VEXID67ZOBMQRGAPH/Revised%20PDD%20-%20Clean.pdf?t=UE58b2pkZGN1fDC4di78ksdJWNuCM4aKL4xu',
'http://www.indoramaindia.com/',
'http://www.ambedkarsugar.com/solar_power.php',
'http://www.aewinfra.com/solar.html',
'https://cdm.unfccc.int/filestorage/S/7/J/S7JBL0QFRHWA1DEZ2O4NUTG5Y6VKMP/PDD%20BASML%20Version%2003.pdf?t=Mmh8b2pxZWR0fDDj8zCpobgUeh2ry3t0r5N5',
'http://www.sembcorp.com/en/media/features?Id=9179',
'https://cdm.unfccc.int/filestorage/Jyh1M',
'http://www.punjlloyd.com/energy/renewables',
'http://www.hiracoindia.com', 'http://www.moserbaersolar.com',
'http://sterlingandwilson.com/giriraj-enterprises-solar-rajasthan',
'http://www.lepl.in',
'https://cdm.unfccc.int/filestorage/7/l/SG76AFZB9EYTJ4321MOWIKQ8PCDNXR.pdf/9684%20PDD.pdf?t=emF8b2pibm5jfDCvkTnPwzrrv5OWSd79JKVI',
'https://cdm.unfccc.int/filestorage/t/h/3ZTDL1JSY8F96X7WM05GQEOBHVCRKU.pdf/PDD_0.85%20MW%20x%202%20Wind?t=OUd8b2pibmtifDA3QWPDKTd9w1VzFuhHzr3p',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1204705646.68/view',
'https://cdm.unfccc.int/filestorage/K/H/4/KH4609PBJU1SRCMD8W7FNQ2LTZGIO5/ENRE%20PDD%20version%205_0_21_12_2010%20Clean%20Mode.pdf?t=dWt8b2picnVxfDDdBloH5bI-0qA-0I--EtaE',
'https://cdm.unfccc.int/filestorage/z/g/GHX720LOZTBENA6IVS1PJCWF8YMKUD.pdf/2347%20Revised%20PDD.pdf?t=YUh8b2picHN3fDCY1tP_B8ik_E25vGal7UR-',
'http://www.lancosolar.com',
'https://cdm.unfccc.int/filestorage/h/l/W9FUXHOTR0Y5V2KQBIS4MPN17JEZLC.pdf/PDD%20-%2098.7%20MW%20wind%20power%20project%20at%20Ratlam%20in%20Madhya%20Pradesh%20by%20DJ%20Energy%20Private%20Limited.pdf?t=R2x8b2picWpsfDACMivhrOTU62i3hfmcbopg',
'http://www.abelloncleanenergy.com', 'http://www.adanipower.com',
'http://www.madhavcorp.com',
'http://www.rrecl.com/PDF/Details%20of%20Rajasthan.pdf',
'http://www.acclimited.com/', 'http://www.birlacorporation.com/',
'http://www.hzlindia.com/', 'http://guptacoal.com/',
'http://www.manikgarhcement.com/', 'http://www.coregreen.in/',
'http://mahagenco.in',
'https://natgrp.org/charanka-gujarat-solar-park/',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1304071464.49/view',
'http://www.p3green.com', 'http://www.reliancepower.co.in/',
'http://jobs.cari.co.in/jobs/bmm-ispat-ltd',
'http://www.daundsugar.com/',
'http://biomass-power.industry-focus.net/karnataka-biomass-projects/148-davangere-sugar-co-to-set-up-30-mw-biomass-power-project-davangere-sugar-co-to-set-up-30-mw-biomass-power-project.html',
'http://www.mukand.com/', 'http://dssk.co.in/',
'http://www.jsw.in/',
'http://www.tradeindia.com/Seller-5380843-SHAMANUR-SUGARS-LIMITED/',
'https://www.jaibalajigroup.com/power.html',
'http://www.ntpc.co.in',
'http://www.westcoastpaper.com/infrastructure/',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1142448670.58/view',
'https://natgrp.files.wordpress.com/solar-power-rajasthan-upto-march-2013.pdf',
'https://cdm.unfccc.int/filestorage/Y/I/R/YIR8M4NQJKLW63H1OZCGBTSA729PU5/37.5_MW_PDD_-_37.5_MW_Wind_power_project_at_Pratapgarh%2C_Rajasthan?t=WTF8b2pkZHAyfDApG31asZJ6b9TuTgaPGJDE',
'http://ujaas.com', 'http://www.firstsolarind.com',
'http://www.aipl.in',
'http://www.nrel.gov/csp/solarpaces/project_detail.cfm/projectID=272',
'http://www.nrel.gov/csp/solarpaces/project_detail.cfm/projectID=258',
'http://www.rrecl.com/PDF/Details%20of20Rajasthan.pdf',
'https://cdm.unfccc.int/Projects/DB/PJR%20CDM1328594375.93/view',
'https://cdm.unfccc.int/Projects/DB/URSCert1371557111.17/view',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1185356859.49/view',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1279516994.31/view',
'https://cdm.unfccc.int/filestorage/O/6/L/O6LOWTE60VD7BDDY8ZB8AQBVNQQZCJ/1115%20PDD%20revised.pdf?t=dWd8b2picW9kfDBLXNNinuA2EARAJAJta6Yw',
'https://cdm.unfccc.int/Projects/DB/BVQI1354554757.67/view',
'https://cdm.unfccc.int/Projects/DB/RWTUV1346053136.1/view',
'http://chambalfertilisers.com/',
'https://cdm.unfccc.int/Projects/DB/RWTUV1202913883.06/view',
'http://www.lloyds.in/',
'https://cdm.unfccc.int/Projects/DB/RWTUV1323884772.8/view',
'http://www.graceindustries.com/', 'http://www.orientcement.com/',
'https://cdm.unfccc.int/Projects/DB/BVQI1239021527.94/view',
'http://sunborneenergy.com', 'http://www.pdpu.ac.in',
'http://www.nrel.gov/csp/solarpaces/project_detail.cfm/projectID=247',
'http://mnre.gov.in/file-manager/UserFiles/State%20wise%20commissioning%20status%20of%20grid%20connected%20solar%20power%20projects/Rajasthan.pdf',
'http://energy.rajasthan.gov.in/content/dam/raj/energy/common/Details%20of%20commissioned%20Solar%20Projects%20.pdf',
'https://cdm.unfccc.int/filestorage/d/d/56KO8FQND07Y3H12PBALXWG9MRC4JV.pdf/PDD_-_GMDC_V05.pdf?t=bGh8b2pkZDEwfDAxL3Mrj_s1ZpnXL5ljSdCb',
'https://www.google.com/maps/d/embed?mid=1bXiXRcLRnBCUfpkHM2J5EVryhG4&ll=31.06035315253383%2C35.02284616406246&z=8',
'http://www.nrel.gov/csp/solarpaces/project_detail.cfm/projectID=263',
'http://www.rrecl.com/PDF/Details%0Rajasthan.pdf',
'http://www.mnre.gov.in',
'http://www.shamrajenercon.com/cer16.htm',
'http://www.purnasakhar.com/', 'http://rkmpowergen.in/',
'https://cdm.unfccc.int/Projects/DB/BVQI1208874936.63/view',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1351776938.73/view',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1352807242.64/view',
'https://cdm.unfccc.int/filestorage/x/_/BYO3D0NJE4W2ZULSAXTQC61IM5HV9R.pdf/7873-PDD-25%20Oct%2012.pdf?t=ZGh8b2pkZ3A5fDB5_GTLkYcWdSL2xW6rf1F3',
'https://cdm.unfccc.int/Projects/DB/BVQI1211956663.14/view',
'https://cdm.unfccc.int/filestorage/s/x/8LZ62R4VIOK570F3AWSBNTUECYJ9XD.pdf/3_Revsied_PDD_JAL_2013_01_08_Clean_mode.pdf?t=VkZ8b2pkY2xwfDD7-5fVv-HnyeV-IOsi3cee',
'http://www.esselgroup.com',
'https://cdm.unfccc.int/Projects/DB/LRQA%20Ltd1346322352.66/view',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1351166669.28/view',
'http://harshaengineers.com', 'http://karnatakapower.com',
'http://hiragroup.com/companies/godawari-green-energy-limited/',
'http://www.jklakshmicement.com/',
'http://www.jalindia.com/subsidiaries.html',
'http://www.jinkosolar.com/press_detail_568.html?lan=en',
'http://wbpdcl.co.in',
'https://cdm.unfccc.int/Projects/DB/RWTUV1342443620.03/view',
'https://cdm.unfccc.int/Projects/DB/RWTUV1279520653.8/view',
'https://cdm.unfccc.int/Projects/DB/BVQI1312546277.77/view',
'https://cdm.unfccc.int/Projects/DB/BVQI1351922842.98/view',
'http://www.chettinad.com/', 'http://www.kjsahluwaliagroup.com/',
'http://www.kcpsugar.com/index.html', 'http://maraloverseas.com/',
'http://www.gmsugars.com/', 'http://www.uttamgalva.com/',
'http://www.chamundeswarisugars.in/',
'http://www.hemarus.co.in/facilities.html',
'http://www.tradeindia.com/Seller-6835496-Shri-Dudhganga-Vedganga-SSK-Ltd-/',
'http://www.kumbhisugar.com/', 'http://www.bannari.com/sugar.html',
'http://www.nslsugars.com/',
'http://www.tradeindia.com/Seller-2020295-HARE-KRISHNA-METALLICS-PVT-LTD-/',
'http://www.dcmshriram.com/',
'https://cdm.unfccc.int/Projects/DB/SGS-UKL1300103078.33/view',
'http://www.gemgroup.in/gem-sugar.html',
'http://www.nrel.gov/csp/solarpaces/project_detail.cfm/projectID=260',
'https://cdm.unfccc.int/Projects/DB/RWTUV1356681143.52/view',
'https://cdm.unfccc.int/Projects/DB/RWTUV1297334588.3/view',
'http://www.azurepower.com',
'http://www.mahaurja.com/PDF/PG2_GridConnSPPCommissioned.pdf',
'http://www.solairedirect.com', 'http://www.acme.in',
'http://www.backboneworld.com', 'http://www.welspunrenewables.com',
'http://www.rrecl.com/PDF/Details%20of%Rajasthan.pdf',
'https://cdm.unfccc.int/filestorage/L/V/Q/LVQ9ANSCJX8164E5H7PWYIB3O0RZ2U/Revised%20PDD%20Version%2006%20dated%205th%20June%202012%20-%20Clean%20mode.pdf?t=akx8b2pxZm9ufDC6M3RsDeQKejKPRFouqWsR',
'https://cdm.unfccc.int/Projects/DB/BVQI1302691944.71/view',
'http://www.tatapowersolar.com',
'https://cdm.unfccc.int/filestorage/w/m/64TXH0Y1V9ZCISBKO3F758PEQNUJDR.pdf/PDD__V-2_5_19_10_2012.pdf?t=akh8b2pkZTFtfDAP0pu4sjZSao0P-GV-Qzqn',
'https://cdm.unfccc.int/filestorage/a/1/0JPNG5S4A821KD6IUHWBOEYRVFTL3X.pdf/ALWE%20PDD_Ver%206%20_06.03.2013_%20Clean%20.pdf?t=dlV8b2picjMyfDAQ_TbHluMZz9jMHYOK8XbU',
'https://cdm.unfccc.int/Projects/DB/BVQI1315830379.2/view',
'https://www.iaea.org/PRIS/CountryStatistics/ReactorDetails.aspx?current=853',
'https://cdm.unfccc.int/Projects/DB/SGS-UKL1200600517.28/view',
'https://cdm.unfccc.int/Projects/DB/KBS_Cert1356901378.31/view',
'http://www.tatapower.com/', 'http://lokmangal.com/site/agro.html',
'https://cdm.unfccc.int/Projects/DB/RINA1359562234.41/view',
'http://www.peda.gov.in/main/SPVPowerProjects.html',
'http://esselinfraprojects.com',
'https://cdm.unfccc.int/Projects/DB/LRQA%20Ltd1354282051.48/view',
'https://cdm.unfccc.int/Projects/DB/LRQA%20Ltd1335362700.21/view',
'http://www.lancogroup.com',
'https://cdm.unfccc.int/Projects/DB/Applus1436171071.3/view',
'https://www.zaubacorp.com/company/MAHATMA-SUGAR-POWER-LIMITED/U15421MH1997PLC111855',
'https://bharatpetroleum.com/', 'http://www.maiharcement.co.in/',
'https://mrpl.co.in/', 'http://www.mangalamcement.com/',
'http://www.ambujacement.com/', 'http://rswm.in/',
'https://cdm.unfccc.int/Projects/DB/LRQA%20Ltd1278588174.29/view',
'https://cdm.unfccc.int/filestorage/I/M/8/IM82KOV79XD6W1FSRY0APUBQT5HNG3/PDD%20Version%203.1?t=eXJ8b2pkY3A3fDDe6Zq2yZtRMfXDrbXuYkLx',
'http://www.jkcement.com/power-plants',
'http://www.gretagroup.com/', 'http://www.ongcindia.com/',
'http://www.technocraftgroup.com/',
'https://cdm.unfccc.int/filestorage/A/R/Q/ARQ8TKW54JBLF1U3E6OXVHCDSN9M70/PDD-CEM-0792-27-09.pdf?t=V0d8b2pibmNpfDBisKvrZGRCvGuhGr5W8zZF',
'http://www.nrel.gov/csp/solarpaces/project_detail.cfm/projectID=264',
'https://cdm.unfccc.int/filestorage/o/t/LOE8KY3952J1UZAFN6CHMPQBVD7RS0.pdf/PDD_RSMML.pdf?t=QUZ8b2pkZzRzfDAgGCgKw2iEeb78bqr1DaV0',
'https://cdm.unfccc.int/filestorage/_/r/F932EYQBPVM7IXZ4KWRUG5HT6S0ACL.pdf/7895-PDD-26%20Oct%2012.pdf?t=aEJ8b2picjhvfDAyiJ_x---be015MnlJwvjn',
'https://www.solairedirect.com', 'http://www.grasim.com/',
'http://www.ril.com/OurBusinesses/Petrochemicals.aspx',
'http://purtigroup.com/gmt-mining.html', 'http://purtigroup.com/',
'http://www.topworthgroup.com/',
'http://www.orientgreenpower.com/', 'http://www.rattanindia.com/',
'http://www.epw.in/journal/1962/42/chair-uncategorised/national-newsprint-and-paper-mills-ltd-nepanagar-mp.html',
'http://www.moneycontrol.com/india/stockpricequote/cement-major/jkcement/JKC03',
'http://www.nitinspinners.com/',
'https://cdm.unfccc.int/Projects/DB/RWTUV1323884913.69/view',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1173772302.89/view',
'https://cdm.unfccc.int/Projects/DB/RWTUV1135356510.37/view',
'https://cdm.unfccc.int/filestorage/5/D/0/5D0I7GCWXZ1VFNS6ROQKHTMYL8JA39/Nagercoil%20PDD%20Corrected_c.pdf?t=bkl8b2pkZGlofDAiW_GZYG7HXm3tF_Ie-IsH',
'http://www.emcpower.com',
'https://cdm.unfccc.int/Projects/DB/LRQA%20Ltd1355495522.4/view',
'https://cdm.unfccc.int/Projects/DB/PJR%20CDM1353582075.03/view',
'http://hiragroup.com',
'https://cdm.unfccc.int/filestorage/3/x/NUSA0ZGPJRT6YXIMEVOB74WDHQ5KF8.pdf/Final%20PDD.pdf?t=QzV8b2picHExfDDzqlVY2T6pSjzHeaKJVjyC',
'https://cdm.unfccc.int/filestorage/L/M/Z/LMZG56NYST19Q7IHD4EPXJRF2VB03W/PDD.pdf?t=R1p8b2picTYxfDApzXyA-gGP8NbbOgKvi68C',
'https://cdm.unfccc.int/filestorage/F/X/W/FXWNHT2DU5RV3BGL7YA8ZKPI46OS9E/2856_PDD_clean.pdf?t=RGp8b2picmp5fDB2P7QGVKqmOvLt7pNBaIqW',
'http://www.gangakhedicpp.com/', 'http://www.ril.com/',
'http://www.sinarmas.com/en/business-units/',
'http://www.gmdcltd.com',
'https://cdm.unfccc.int/Projects/DB/SIRIM1355276782.28/view',
'https://cdm.unfccc.int/Projects/DB/RWTUV1306214743.43/view',
'https://cdm.unfccc.int/Projects/DB/RWTUV1288029478.94/view',
'http://www.reliancepower.co.in',
'https://cdm.unfccc.int/Projects/DB/RWTUV1411994965.49/view',
'https://cdm.unfccc.int/Projects/DB/LRQA%20Ltd1270819651.34/view',
'https://cdm.unfccc.int/Projects/DB/LRQA%20Ltd1300097036.88/view',
'https://cdm.unfccc.int/Projects/DB/LRQA%20Ltd1264590823.08/view',
'http://icmlindia.com', 'http://www.suranaind.com/',
'http://www.mapsofindia.com/whitepages/foods-and-beverages/food-processors-and-manufacturers/sprsugars',
'http://naturalsugar.in/', 'http://www.shreecement.in/',
'https://cdm.unfccc.int/Projects/DB/BVQI1201770524.09/view',
'http://astonfield.com', 'http://taxusgroup.com',
'https://cdm.unfccc.int/Projects/DB/RINA1354700454.05/view',
'https://www.iocl.com',
'http://sterlingandwilson.com/northwest-energy-pvt-ltd',
'http://ifcext.ifc.org/ifcext/91',
'https://cdm.unfccc.int/Projects/DB/SGS-UKL1316100537.64/view',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1263981578.63/view',
'https://cdm.unfccc.int/filestorage/z/4/H6BE40Q3YPZ7ULD8OWFR1AIT5V2SXM.pdf/8592%20PDD.pdf?t=Sjd8b2pkZHV0fDDHMHx5kl40-cCl4qrFlA08',
'https://cdm.unfccc.int/Projects/DB/RWTUV1285157251.43/view',
'http://www.sitsonindia.com/pdf/EPC%20power%20projects.pdf',
'http://www.somaiya.com/',
'http://food.industry-focus.net/sugar/379-kranti-ssk-to-expand-its-sugar-mill-in-kundal-village-kranti-ssk-to-expand-its-sugar-mill-in-kundal-village.html',
'https://cdm.unfccc.int/Projects/redirector?ref=4063',
'http://viainfotech.biz/Biomass/theme5/document/green_market/REC-project-list.pdf',
'http://www.belgaum.nic.in/english/Heerasugarfactory.html',
'http://www.thoratsugar.com/', 'http://dattasugar.co.in/',
'http://www.mapsofindia.com/whitepages/foods-and-beverages/wine-and-beer/shrimassk',
'http://www.dcmsr.com/',
'https://cdm.unfccc.int/Projects/DB/LRQA%20Ltd1340102581.62/view',
'https://cdm.unfccc.int/Projects/DB/BVQI1351312018.91/view',
'http://www.cdmindia.gov.in/project_details_view.php?id=1511',
'http://rehpl.co.in',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1318484242.75/view',
'https://cdm.unfccc.int/filestorage/0/5/A/05AYCZFT4LWMHD216PU9SV7XO8BENR/PDD%20Version%201.2?t=U2Z8b2picWJsfDAiEl4MxIcSh1urnyFbvIy_',
'http://www.monosteel.in',
'http://www.jains.com/Solar/Renewable%20energy.htm',
'http://www.madhavgrp.co.in/SJ_solar.html',
'http://www.konarkgroup.co.in',
'https://cdm.unfccc.int/filestorage/D/O/R/DORZF791H8K4NWACPQ5IMTUYLG2VXE/CDM_PDD_v6.pdf?t=WGN8b2sxbDRnfDDF8h6MT3w9DKjkxFa903Mt',
'http://www.moserbaerprojects.com',
'https://cdm.unfccc.int/Projects/DB/RWTUV1352192814.76/view',
'https://cdm.unfccc.int/Projects/DB/LRQA%20Ltd1321971513.67/view',
'https://cdm.unfccc.int/Projects/DB/SGS-UKL1333450990.34/view',
'https://cdm.unfccc.int/Projects/DB/KBS_Cert1356901324.44/view',
'http://www.cloversolar.com',
'https://cdm.unfccc.int/filestorage/M/7/A/M7ACU21YQXKW5HF8R03TPIJZENO4B9/PDD_30MW_Gujarat_MSPL%20Limited_rev.pdf?t=NlZ8b2pkZ2FofDBI3_hXb8zCdgFEZVKDwM4F',
'http://cescnewinitiatives.com/contents/view/DHARIWALINFRASTRUCTURELIMITED/T',
'http://sanvijay.com/gil.html', 'http://www.rcfltd.com/',
'http://www.nocil.com/',
'http://documents.worldbank.org/curated/en/442061468041961880/pdf/multi-page.pdf',
'https://cdm.unfccc.int/Projects/DB/BVQI1331114727.08/view',
'https://cdm.unfccc.int/Projects/DB/BVQI1336676184.92/view',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1354626555.71/view',
'https://cdm.unfccc.int/Projects/DB/RINA1356126148.87/view',
'https://cdm.unfccc.int/filestorage/2/F/R/2FRTLO45MK87SJZUYCDN9GWHBAEP3X/7647_PDD_-Ver_12%28PRC%29_Clean.pdf?t=MWx8b2picXlqfDDrpXwmh5zMU8K3xoiuspy8',
'http://www.tatabpsolar.com',
'https://cdm.unfccc.int/Projects/DB/RWTUV1352368180.64/view',
'https://cdm.unfccc.int/filestorage/i/o/Z9N7VXYJ5LIE8WRQM1SB3H4FDA6U2K.pdf/PDD%20with%20revised%20MP%20(ver%2010.0)(Clean%20mode).pdf?t=Qmt8b2picnBrfDBPz_lFTjdrigpAyplKqoQb',
'https://cdm.unfccc.int/Projects/DB/SGS-UKL1374760004.52/view',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1351859167.83/view',
'https://cdm.unfccc.int/Projects/DB/PJR%20CDM1355461708.47/view',
'https://cdm.unfccc.int/Projects/DB/SGS-UKL1353506542.98/view',
'https://cdm.unfccc.int/Projects/DB/RWTUV1351718801.6/view',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1174563974.61/view',
'https://www.vardhman.com/', 'https://www.kesocorp.com/',
'http://www.nationalfertilizers.com/',
'http://www.adityabirla.com/about/Grey-cement',
'https://cdm.unfccc.int/Projects/DB/SGS-UKL1225104443.35/view',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1350385874.53/view',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1172467864.06/view',
'http://www.gipcl.com',
'http://www.gmrgroup.in/energy-gmr-gujarat-solar-power-ltd.aspx',
'https://cdm.unfccc.int/Projects/DB/LRQA%20Ltd1323254431.52/view',
'https://cdm.unfccc.int/Projects/DB/LRQA%20Ltd1356711565.56/view',
'https://cdm.unfccc.int/Projects/DB/DNV-CUK1354785555.66/view',
'https://cdm.unfccc.int/Projects/DB/TUEV-RHEIN1351077305.18/view',
'https://cdm.unfccc.int/Projects/DB/RWTUV1345031355.85/view'],
dtype=object)
#For a model, the URL characteristic is irrelevant. We can drop this now.
df['geolocation_source'].unique()
array(['National Renewable Energy Laboratory', 'WRI', nan,
'Industry About'], dtype=object)
df.drop(['gppd_idnr','url' ], axis=1,inplace=True)
df.shape
(907, 15)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 907 entries, 0 to 906 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 capacity_mw 907 non-null float64 1 latitude 861 non-null float64 2 longitude 861 non-null float64 3 primary_fuel 907 non-null object 4 commissioning_year 527 non-null float64 5 owner 342 non-null object 6 source 907 non-null object 7 geolocation_source 888 non-null object 8 year_of_capacity_data 519 non-null float64 9 generation_gwh_2014 398 non-null float64 10 generation_gwh_2015 422 non-null float64 11 generation_gwh_2016 434 non-null float64 12 generation_gwh_2017 440 non-null float64 13 generation_gwh_2018 448 non-null float64 14 generation_data_source 449 non-null object dtypes: float64(10), object(5) memory usage: 106.4+ KB
df.head(15)
| capacity_mw | latitude | longitude | primary_fuel | commissioning_year | owner | source | geolocation_source | year_of_capacity_data | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | generation_data_source | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.50 | 28.1839 | 73.2407 | Solar | 2011.0 | Solar Paces | National Renewable Energy Laboratory | National Renewable Energy Laboratory | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 98.00 | 24.7663 | 74.6090 | Coal | NaN | Ultratech Cement ltd | Ultratech Cement ltd | WRI | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 39.20 | 21.9038 | 69.3732 | Wind | NaN | AES | CDM | WRI | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 135.00 | 23.8712 | 91.3602 | Gas | 2004.0 | NaN | Central Electricity Authority | WRI | 2019.0 | 617.789264 | 843.7470 | 886.004428 | 663.774500 | 626.239128 | Central Electricity Authority |
| 4 | 1800.00 | 21.9603 | 82.4091 | Coal | 2015.0 | NaN | Central Electricity Authority | WRI | 2019.0 | 3035.550000 | 5916.3700 | 6243.000000 | 5385.579736 | 7279.000000 | Central Electricity Authority |
| 5 | 250.00 | 23.7689 | 68.6447 | Coal | 2005.0 | NaN | Central Electricity Authority | WRI | 2019.0 | 1153.421000 | 1208.8520 | 1175.765000 | 1147.913000 | 976.655000 | Central Electricity Authority |
| 6 | 60.00 | 10.4547 | 77.0078 | Hydro | 1970.0 | NaN | Central Electricity Authority | WRI | 2019.0 | 157.558250 | 152.1952 | 61.421350 | 89.629600 | 48.327150 | Central Electricity Authority |
| 7 | 192.00 | 32.2258 | 77.2070 | Hydro | 2010.0 | NaN | Central Electricity Authority | WRI | 2019.0 | 674.391100 | 721.3352 | 675.724400 | 679.594950 | 579.318850 | Central Electricity Authority |
| 8 | 290.00 | 16.3300 | 75.8863 | Hydro | 2004.0 | NaN | Central Electricity Authority | WRI | 2019.0 | 480.594950 | 144.4342 | 402.029750 | 439.372100 | 406.377900 | Central Electricity Authority |
| 9 | 210.00 | 23.1642 | 81.6373 | Coal | 2008.0 | NaN | Central Electricity Authority | WRI | 2019.0 | 1887.904000 | 1643.0460 | 1338.093000 | 1563.457000 | 1487.880000 | Central Electricity Authority |
| 10 | 1350.00 | 21.0782 | 77.9009 | Coal | 2014.0 | NaN | Central Electricity Authority | WRI | 2019.0 | 1920.971000 | 5629.6630 | 1701.008000 | 4350.558000 | 3717.154000 | Central Electricity Authority |
| 11 | 134.00 | 31.2717 | 76.4938 | Hydro | 1985.0 | NaN | Central Electricity Authority | WRI | 2019.0 | 614.412500 | 665.1973 | 670.500650 | 644.570950 | 425.641100 | Central Electricity Authority |
| 12 | 1200.00 | 24.2007 | 82.8000 | Coal | 2011.0 | NaN | Central Electricity Authority | WRI | 2019.0 | 7744.000000 | 8076.8105 | 7902.021600 | 7940.740000 | 7873.973000 | Central Electricity Authority |
| 13 | 16.95 | 31.2412 | 77.8769 | Hydro | 1986.0 | NaN | Central Electricity Authority | WRI | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 14 | 9.00 | NaN | NaN | Hydro | 1987.0 | NaN | Central Electricity Authority | WRI | 2019.0 | NaN | NaN | NaN | NaN | NaN | NaN |
df['owner'].nunique()
280
df['owner'].unique()
array(['Solar Paces', 'Ultratech Cement ltd', 'AES', nan,
'Hindustan Pvt lt', 'Dr Ssk ltd', 'Ashok Ssk ltd',
'Shree Sugars ltd', 'Ener-t International Ltd.',
'Nav Bharat Buildcon Private Limited', 'MEIL Green Power Limited',
'Madras Cement Limited',
'DDE Renewable Energy Private Limited (DREPL)',
'Electromech Maritech Private Limited',
'Finehope Allied Engineering Private Limited',
'Khaya Solar Projects Private Limited',
'Newton Solar Private Limited', 'Saidham Overseas Private Limited',
'Vasavi Solar Power Private Limited', 'Indian Power ltd',
'Ballarpur Industries ltd', 'Jk Cement ltd', 'India Ltd icl)',
'Bharat Petroleum Corporation Limited', 'Bhilwara Energy Limited',
'Ideal Projects ltd', 'Ugar Works ltd', 'Vishwanath Sugars ltd',
'Hothur Pvt ltd', 'Janki Corp ltd', 'Mysore Mills ltd',
'Sunflag Co ltd', 'Varam Pvt ltd', 'Sangam Spinners ltd',
'Bharat Refinery ltd', 'Binani Industries ltd',
'Indo Synthetics ltd', 'Dr Babasaheb Ambedkar SSK Limited',
'AEW Infratech Private Limited', 'Enercon India Ltd.',
'Green Infra Solar Farms Private Limited (GIS)',
'OPG Energy Private Limited', 'Punj Lloyd Solar Private Limited',
'Hiraco Renewable Energy Private Limited (HREPL)',
'Moser Baer Energy & Development Limited (MBEDL)',
'Giriraj Enterprises', 'LEPL Projects Limited',
'Lanco Solar Energy Private Limited',
'Abellon CleanEnergy Limited', 'Adani Power Limited',
'Madhav Solar Private Limited', 'Rays Power Private Limited',
'Acc Acc ltd', 'Birla Corp ltd', 'Hindustan Zinc ltd',
'Gupta Ltd gepl)', 'Manikghar Cement co', 'Core Fuels ltd',
'Maharashtra State Power Generation Co Ltd (MAHAGENCO)',
'Charanka Solar Park', 'India Oil Corporation Limited',
'Alex Green Energy Private Limited', 'Reliance Power ltd',
'Bmm Pvt ltd', 'Daund Sugar ltd', 'Davangere Co ltd',
'Mukand Mukand ltd', 'Shri Ssk ltd', 'Jsw Steel ltd',
'Shamanur Sugars ltd', 'Jai Industries ltd', 'Government of India',
'West Coast Paper Mills Ltd.', 'Sanjeev Prakashan',
'Ujaas Energy Limited', 'Sand Land Real Estates Pvt Ltd',
'Aravali Infrapower Private Limited (AIPL)', 'Reliance Power',
'Lanco Infratech', 'Sovox Renewables Private Limited',
'EMCO Limited', 'Frost International Limited',
'Gujurat Alkalies and Chemicals Limited', 'Chambal & chem', 'GFL',
'Lloyds Industries ltd',
'Bangalore Electricity Supply Company Limited',
'Grace Industries ltd', 'Orient Cement ltd',
'SunBorne Energy Gujarat',
'Pandit Deendayal Petrolium University (PDPU)',
'National Thermal Power Corporation (NTPC) Limited',
'Godwari Green Energy Limited',
'Zamil New Delhi Infrastructure Private Limited',
'Shree Saibaba Green Power Private Limited', 'Solaer',
'Cargo Solar Power', 'Amrit Energy Private Limited',
'Solar Energy Centre', 'Jawahar Ssk ltd', 'Purna Ssk ltd',
'Rk Pvt ltd', 'Hindustan Zinc Ltd.', 'HZL',
'Essel MP Energy Limited', 'IL&FS Wind Power Limited',
'Fortune Five Hydel Projects Limited', 'Harsha Engineers Limited',
'Karnataka Power Corporation Limited', 'Godawari Energy ltd',
'Jaypee Ltd jccl)', 'Sunkon Energy Private Limited',
'West Bengal Energy Development Corporation Limited (WBEDCL)',
'Energy Infratech Private Limited', 'K. N. Ram',
'Chettinad Corp ltd', 'Kjs Ahluwalia group', 'Ym Ssk ltd',
'Maral Overseas ltd', 'Gm Energy ltd', 'Uttam Steels ltd',
'Sri Sugars ltd', 'Hemarus Technologies ltd', 'Shri Vedganga ssk',
'Kumbhi Ssk ltd', 'Bannari Sugars ltd', 'Nsl Sugars ltd',
'Hare Pvt ltd', 'Dcm & chem', 'KS Oils', 'Gem Sugars ltd',
'KVK Energy Ventures Ltd', 'Bindu Vau Urja Private Limited',
'SunBorne Energy Gujarat One Pvt', 'Karur Textile Park limited',
'Azure Power (Rajasthan) Private Limited',
'Citra Real Estate Limited', 'Sepset Construction Limited',
'Gujarat Urja Vikas Nigam Limited',
'Firestone Trading Private Limited',
'Northern Solaire Prakash Private Limited',
'Suryauday Solaire Prakash Private Limited', 'ACME Solar Energy',
'Unity Power Limited', 'Welspun Urja India Limited',
'Conflux Infratech Private Limited', 'REI Agro Limited',
'Kohinoor Planet Construction Pvt. Ltd',
'Tata Power Solar Systems Limited (TPREL)',
'Loyal Textile Mills Limited', 'LNJ Power Ventures Limited',
'Tata Co ltd', 'Lokmangal Lokmangal group',
'Atma Powers Private Limited', 'Essel Clean Energy Limited',
'IK Energy Private Limited', 'Serum Institute of India Limited',
'Lanco Solar Power Limited', 'Liberty Oil Mills Limited',
'Mahatma Power ltd', 'Bharat Corp ltd', 'Maihar Cement ltd',
'Mangalore & petrochem', 'Manglam Cement ltd',
'Manikgarh Manikgarh cement', 'Ambuja Cements ltd',
'Rswm Rswm ltd', 'Jodhpur Vidyut Vitran Nigam Limited', 'MSPL',
'Greta Energy ltd', 'Ongc Gas corp',
'Technocraft Technocraft group',
'Megha Engineering and Infrastructure',
'Alianz Eco Power Private Limited', 'Nexgen Solux Private Limited',
'Solairedirect Projects India Private Limited',
'Grasim Industries ltd', 'Reliance Reliance petrochemicals',
'Gmt Pvt ltd', 'Purti Sugar ltd', 'Topworth Metals ltd',
'Orient Power ltd', 'Rattanindia Power ltd', 'National And paper',
'Nitin Spinners ltd', 'NMDC', 'Nuziveedu Seeds limited',
'EMC Limited', 'Tadas Wind Energy', 'Bhoruka Power',
'Godawari Green Energy Limited (GGEL)',
'Precision Technik Private Limited',
'Alex Spectrum Radiations Private Limited', 'NuPower',
'Gangakhed Energy ltd', 'Reliance Ltd ril)', 'Sinarmas Paper ltd',
'Gujarat Mineral Development Corporation Limited (GMDC)',
'Tratnagiri Wind Power Projects Private Limited',
'Patnaik Minerals Pvt. Ltd', 'Vivek Pharmachem (India) Limited',
'Rajasthan Renewable Energy Corporation Limited (RREC)',
'Reliance Power Limited', 'Powerica Limited',
'Integrated Coal Mining (ICML)',
'Precious Energy Services Pvt Ltd', 'Azure Urja Private Limited',
'Surana Industries ltd', 'Grasim Cement ltd', 'Spr Pvt ltd',
'Natural Alliend indust', 'Shree Cement ltd',
'Rajasthan State Mines and Minerals Limited',
'National Thermal Power Corporation (NTPC)',
'Astonfield Management Limited',
'Welspun Solar Punjab Private Limited',
'Taxus Infrastructure and Power Projects Pvt Ltd',
'Ganges Enterprises Private Limited',
'Ratedi Wind Power Private Limited',
'Indian Oil Corporation (IOC)',
'Northwest Energy Private Limited (NWEPL)',
'SEI Solar Energy Private Limited',
'Welspun Solar AP Private Limited', 'Relaxo Footwears Limited',
"Roaring 40's Wind Farms Private Limited",
'Rucha Soya Industries Limited', 'Sahakar Shiromani vasantrao',
'Godavari Mills ltd', 'Kranti Ssk ltd', 'Sinewave Pvt ltd',
'Vasantdada Ssk ltd', 'Shri Hiranyakeshi ssk',
'Sahakarmaharshi Bhausaheb thor', 'Datta Ssk ltd',
'Shri Malaprabha ssk', 'Dcm Industries ltd', 'Mula Ssk ltd',
'SWPPL', 'Aftaab Solar Private Limited (ASPL)',
'Raajratna Energy Holdings Private Limited (REHPL)',
'Sai Sulphonates', 'Omega Solar Projects Private Limited',
'Mono Steel (India) Ltd', 'Jain Solar',
'Backbone Enterprises Limited',
'S J Green Park Energy Private Limited',
'Konark Gujarat Private Limited', 'Shree Naman Developers Ltd.',
'Moser Baer Clean Energy Limited (MBCEL)',
'Green Infra Wind Power Projects Limited',
'Solitaire Energies Pvt Ltd', 'Centaur Mercantile Pvt. Ltd.',
'Sterling Agro Industries ltd.', 'Sumilon Industries Limited',
'Clover Solar Private Limited (CSPL)', 'Dhariwal Pvt ltd',
'Mukesh Gupta group', 'Rashtriya & fert', 'Nocil Nocil rubber',
'TVS Energy limited', 'TVS Energy Limited',
'Tadas Wind Energy Private Limited', 'Tata BP Solar India Limited',
'Tata Power', 'Rayala Wind Power Company Private Limited',
'Moser Baer Solar Limited (MBSL)', 'Torrent Power Limited', 'UPL',
'Vardham Vardham group', 'Kesoram Industries ltd',
'National Ltd nfl)', 'Vikram Vikram cement', 'Vaayu Renew',
'Gujarat Industries Power Company Limited (GIPCL)',
'ACME Cleantech Solutions Private Limited',
'Videocon Industries Limited', 'Weizzmann Forex Limited',
'Riddhi Siddhi Gluco Biols Limited',
'Mytrah Energy (India) Limited', 'Yashwantrao Krishna ssk',
'iEnergy Wind Farms'], dtype=object)
df.groupby('owner')['capacity_mw'].mean()
owner
ACME Cleantech Solutions Private Limited 15.0
ACME Solar Energy 25.0
AES 39.2
AEW Infratech Private Limited 1.0
Abellon CleanEnergy Limited 3.0
...
West Coast Paper Mills Ltd. 20.0
Yashwantrao Krishna ssk 16.0
Ym Ssk ltd 16.0
Zamil New Delhi Infrastructure Private Limited 1.0
iEnergy Wind Farms 16.5
Name: capacity_mw, Length: 280, dtype: float64
df['owner'].value_counts(dropna=False)
NaN 565
Jk Cement ltd 4
Acc Acc ltd 4
Sterling Agro Industries ltd. 4
Maharashtra State Power Generation Co Ltd (MAHAGENCO) 3
...
Sunkon Energy Private Limited 1
West Bengal Energy Development Corporation Limited (WBEDCL) 1
Energy Infratech Private Limited 1
K. N. Ram 1
iEnergy Wind Farms 1
Name: owner, Length: 281, dtype: int64
df['source'].nunique()
191
df['geolocation_source'].value_counts(dropna=False)
WRI 765 Industry About 119 NaN 19 National Renewable Energy Laboratory 4 Name: geolocation_source, dtype: int64
df['generation_data_source'].value_counts(dropna=False)
NaN 458 Central Electricity Authority 449 Name: generation_data_source, dtype: int64
df.drop(['owner','source','generation_data_source','geolocation_source'], axis=1,inplace=True)
df.head()
| capacity_mw | latitude | longitude | primary_fuel | commissioning_year | year_of_capacity_data | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.5 | 28.1839 | 73.2407 | Solar | 2011.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 98.0 | 24.7663 | 74.6090 | Coal | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 39.2 | 21.9038 | 69.3732 | Wind | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 135.0 | 23.8712 | 91.3602 | Gas | 2004.0 | 2019.0 | 617.789264 | 843.747 | 886.004428 | 663.774500 | 626.239128 |
| 4 | 1800.0 | 21.9603 | 82.4091 | Coal | 2015.0 | 2019.0 | 3035.550000 | 5916.370 | 6243.000000 | 5385.579736 | 7279.000000 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 907 entries, 0 to 906 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 capacity_mw 907 non-null float64 1 latitude 861 non-null float64 2 longitude 861 non-null float64 3 primary_fuel 907 non-null object 4 commissioning_year 527 non-null float64 5 year_of_capacity_data 519 non-null float64 6 generation_gwh_2014 398 non-null float64 7 generation_gwh_2015 422 non-null float64 8 generation_gwh_2016 434 non-null float64 9 generation_gwh_2017 440 non-null float64 10 generation_gwh_2018 448 non-null float64 dtypes: float64(10), object(1) memory usage: 78.1+ KB
This is a categorization issue, thus we must develop a model to foretell the primary fuel type.
df.isnull().sum()
capacity_mw 0 latitude 46 longitude 46 primary_fuel 0 commissioning_year 380 year_of_capacity_data 388 generation_gwh_2014 509 generation_gwh_2015 485 generation_gwh_2016 473 generation_gwh_2017 467 generation_gwh_2018 459 dtype: int64
#% of data missing
round((df.isnull().sum()/df.shape[0])*100,2)
capacity_mw 0.00 latitude 5.07 longitude 5.07 primary_fuel 0.00 commissioning_year 41.90 year_of_capacity_data 42.78 generation_gwh_2014 56.12 generation_gwh_2015 53.47 generation_gwh_2016 52.15 generation_gwh_2017 51.49 generation_gwh_2018 50.61 dtype: float64
sns.countplot(df['primary_fuel'],order=df['primary_fuel'].value_counts().index)
<AxesSubplot:xlabel='primary_fuel', ylabel='count'>
df['primary_fuel'].value_counts(normalize=True,dropna=False)
Coal 0.284454 Hydro 0.276736 Solar 0.140022 Wind 0.135612 Gas 0.076075 Biomass 0.055127 Oil 0.022051 Nuclear 0.009923 Name: primary_fuel, dtype: float64
df['primary_fuel'].value_counts()
Coal 258 Hydro 251 Solar 127 Wind 123 Gas 69 Biomass 50 Oil 20 Nuclear 9 Name: primary_fuel, dtype: int64
sns.distplot(df['capacity_mw'])
<AxesSubplot:xlabel='capacity_mw', ylabel='Density'>
df['capacity_mw'].describe().T
count 907.000000 mean 326.223755 std 590.085456 min 0.000000 25% 16.725000 50% 59.200000 75% 385.250000 max 4760.000000 Name: capacity_mw, dtype: float64
#This distribution is not normal.
sns.countplot(df['commissioning_year'])
<AxesSubplot:xlabel='commissioning_year', ylabel='count'>
df['commissioning_year'].value_counts(dropna=False)
NaN 380
2015.0 28
2013.0 25
2012.0 23
2016.0 19
...
1946.0 1
1956.0 1
1937.0 1
1949.0 1
1953.0 1
Name: commissioning_year, Length: 74, dtype: int64
df.head(2)
| capacity_mw | latitude | longitude | primary_fuel | commissioning_year | year_of_capacity_data | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.5 | 28.1839 | 73.2407 | Solar | 2011.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 98.0 | 24.7663 | 74.6090 | Coal | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
sns.distplot(df['year_of_capacity_data'])
<AxesSubplot:xlabel='year_of_capacity_data', ylabel='Density'>
df['year_of_capacity_data'].unique()
array([ nan, 2019.])
df['year_of_capacity_data'].value_counts(dropna=False)
2019.0 519 NaN 388 Name: year_of_capacity_data, dtype: int64
df.drop(['year_of_capacity_data'],axis=1,inplace=True)
df.head(2)
| capacity_mw | latitude | longitude | primary_fuel | commissioning_year | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.5 | 28.1839 | 73.2407 | Solar | 2011.0 | NaN | NaN | NaN | NaN | NaN |
| 1 | 98.0 | 24.7663 | 74.6090 | Coal | NaN | NaN | NaN | NaN | NaN | NaN |
sns.distplot(df['generation_gwh_2014'])
<AxesSubplot:xlabel='generation_gwh_2014', ylabel='Density'>
sns.distplot(df['generation_gwh_2015'])
<AxesSubplot:xlabel='generation_gwh_2015', ylabel='Density'>
sns.distplot(df['generation_gwh_2016'])
<AxesSubplot:xlabel='generation_gwh_2016', ylabel='Density'>
sns.distplot(df['generation_gwh_2017'])
<AxesSubplot:xlabel='generation_gwh_2017', ylabel='Density'>
sns.distplot(df['generation_gwh_2018'])
<AxesSubplot:xlabel='generation_gwh_2018', ylabel='Density'>
df.head(2)
| capacity_mw | latitude | longitude | primary_fuel | commissioning_year | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.5 | 28.1839 | 73.2407 | Solar | 2011.0 | NaN | NaN | NaN | NaN | NaN |
| 1 | 98.0 | 24.7663 | 74.6090 | Coal | NaN | NaN | NaN | NaN | NaN | NaN |
sns.countplot(df['primary_fuel'],hue=df['capacity_mw'])
<AxesSubplot:xlabel='primary_fuel', ylabel='count'>
visual=df.groupby('primary_fuel')['capacity_mw'].mean().reset_index()
visual
| primary_fuel | capacity_mw | |
|---|---|---|
| 0 | Biomass | 20.065200 |
| 1 | Coal | 797.826434 |
| 2 | Gas | 364.818928 |
| 3 | Hydro | 185.026972 |
| 4 | Nuclear | 975.555556 |
| 5 | Oil | 88.942000 |
| 6 | Solar | 21.712598 |
| 7 | Wind | 33.429675 |
df['primary_fuel'].value_counts(normalize=True,dropna=False)
Coal 0.284454 Hydro 0.276736 Solar 0.140022 Wind 0.135612 Gas 0.076075 Biomass 0.055127 Oil 0.022051 Nuclear 0.009923 Name: primary_fuel, dtype: float64
# primary_fuel and Capacity_mw
visual
| primary_fuel | capacity_mw | |
|---|---|---|
| 0 | Biomass | 20.065200 |
| 1 | Coal | 797.826434 |
| 2 | Gas | 364.818928 |
| 3 | Hydro | 185.026972 |
| 4 | Nuclear | 975.555556 |
| 5 | Oil | 88.942000 |
| 6 | Solar | 21.712598 |
| 7 | Wind | 33.429675 |
sns.barplot(visual['primary_fuel'], visual['capacity_mw'])
<AxesSubplot:xlabel='primary_fuel', ylabel='capacity_mw'>
visual.sort_values('capacity_mw',ascending=False)
| primary_fuel | capacity_mw | |
|---|---|---|
| 4 | Nuclear | 975.555556 |
| 1 | Coal | 797.826434 |
| 2 | Gas | 364.818928 |
| 3 | Hydro | 185.026972 |
| 5 | Oil | 88.942000 |
| 7 | Wind | 33.429675 |
| 6 | Solar | 21.712598 |
| 0 | Biomass | 20.065200 |
df.groupby('primary_fuel')['commissioning_year'].count()
primary_fuel Biomass 0 Coal 189 Gas 59 Hydro 251 Nuclear 8 Oil 12 Solar 8 Wind 0 Name: commissioning_year, dtype: int64
df['commissioning_year'].nunique()
73
df['commissioning_year'].unique()
array([2011., nan, 2004., 2015., 2005., 1970., 2010., 2008., 2014.,
1985., 1986., 1987., 2000., 1989., 2017., 1977., 1980., 2003.,
1984., 1976., 1996., 1991., 2002., 2001., 2006., 2007., 1988.,
1973., 2012., 1968., 1964., 2016., 2013., 1962., 1998., 1993.,
1997., 1999., 1994., 1975., 1992., 1972., 1979., 1967., 1995.,
1983., 1981., 2018., 1959., 1961., 1957., 1978., 1963., 1971.,
1955., 2009., 1966., 1965., 1960., 1949., 1952., 1990., 1982.,
1937., 1956., 1946., 1958., 1943., 1954., 1969., 1939., 1927.,
1974., 1953.])
df['commissioning_year'].value_counts()
2015.0 28
2013.0 25
2012.0 23
2016.0 19
2010.0 18
..
1946.0 1
1956.0 1
1937.0 1
1949.0 1
1953.0 1
Name: commissioning_year, Length: 73, dtype: int64
df.head(5)
| capacity_mw | latitude | longitude | primary_fuel | commissioning_year | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.5 | 28.1839 | 73.2407 | Solar | 2011.0 | NaN | NaN | NaN | NaN | NaN |
| 1 | 98.0 | 24.7663 | 74.6090 | Coal | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 39.2 | 21.9038 | 69.3732 | Wind | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 135.0 | 23.8712 | 91.3602 | Gas | 2004.0 | 617.789264 | 843.747 | 886.004428 | 663.774500 | 626.239128 |
| 4 | 1800.0 | 21.9603 | 82.4091 | Coal | 2015.0 | 3035.550000 | 5916.370 | 6243.000000 | 5385.579736 | 7279.000000 |
plt.figure(figsize=(12,8))
sns.scatterplot(x='generation_gwh_2014',y='capacity_mw',hue='primary_fuel', data=df)
<AxesSubplot:xlabel='generation_gwh_2014', ylabel='capacity_mw'>
df.groupby('primary_fuel')['generation_gwh_2014'].median()
primary_fuel Biomass NaN Coal 3193.101000 Gas 559.010000 Hydro 386.005275 Nuclear 3875.323150 Oil 0.000000 Solar NaN Wind NaN Name: generation_gwh_2014, dtype: float64
plt.figure(figsize=(12,8))
sns.scatterplot(x='generation_gwh_2015',y='capacity_mw',hue='primary_fuel', data=df)
<AxesSubplot:xlabel='generation_gwh_2015', ylabel='capacity_mw'>
df.groupby('primary_fuel')['generation_gwh_2015'].mean()
primary_fuel Biomass NaN Coal 4649.585751 Gas 840.159648 Hydro 709.991238 Nuclear 4234.875000 Oil 21.710086 Solar NaN Wind NaN Name: generation_gwh_2015, dtype: float64
df.groupby('primary_fuel')['generation_gwh_2015']
<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000179AA43AB20>
df[df['primary_fuel']=='Biomass']
| capacity_mw | latitude | longitude | primary_fuel | commissioning_year | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 19 | 10.00 | 18.0845 | 76.1851 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 20 | 17.00 | 19.5867 | 74.7061 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 21 | 24.00 | 16.7708 | 74.9191 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 40 | 28.00 | 16.2804 | 75.2250 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 70 | 22.80 | 16.6505 | 74.8209 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 71 | 15.00 | 16.2774 | 74.7111 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 80 | 10.00 | 21.0813 | 79.7708 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 94 | 10.00 | 25.2267 | 74.6216 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 152 | 24.00 | 16.5471 | 77.0825 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 164 | 30.00 | 18.4271 | 74.6338 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 165 | 24.00 | 14.3277 | 75.8808 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 180 | 12.96 | 19.4496 | 75.0382 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 185 | 22.00 | 14.6284 | 75.8448 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 255 | 24.00 | 16.6038 | 74.4045 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 257 | 25.50 | 17.1202 | 76.4004 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 260 | 18.90 | 19.2829 | 77.1384 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 262 | 25.00 | 13.9808 | 76.6279 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 286 | 50.00 | 27.6010 | 72.2242 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 321 | 16.00 | 17.1657 | 74.2459 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 337 | 18.00 | 14.7753 | 75.3131 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 347 | 26.00 | 12.4806 | 77.0411 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 354 | 20.00 | 16.0327 | 74.4583 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 355 | 20.00 | 16.5890 | 74.2993 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 356 | 17.50 | 16.6890 | 74.1201 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 357 | 16.00 | 12.1056 | 76.7606 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 361 | 26.00 | 12.7020 | 76.9787 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 379 | 16.50 | 16.2651 | 75.5302 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 426 | 31.50 | 17.4462 | 75.7161 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 450 | 15.00 | 20.8558 | 78.5814 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 496 | 11.20 | 15.8555 | 75.1002 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 516 | 10.00 | 21.0582 | 79.4850 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 517 | 18.90 | 20.8062 | 79.0548 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 520 | 10.00 | 22.9649 | 78.8132 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 576 | 30.00 | 18.9057 | 76.7282 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 633 | 11.00 | 12.7145 | 77.4524 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 640 | 12.50 | 18.5354 | 76.2450 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 679 | 18.00 | 17.7080 | 75.1361 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 686 | 24.00 | 16.3918 | 75.0411 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 687 | 19.70 | 17.1354 | 74.4272 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 688 | 10.00 | 16.9171 | 74.4821 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 689 | 12.50 | 16.8810 | 74.5898 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 692 | 26.00 | 16.2478 | 74.5003 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 698 | 16.00 | 19.6125 | 74.1924 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 710 | 36.00 | 16.7493 | 74.5854 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 716 | 22.00 | 15.6933 | 74.7162 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 717 | 20.00 | 17.8676 | 74.8405 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 730 | 16.00 | 19.3804 | 74.8397 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 819 | 28.00 | 15.6609 | 76.8756 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 874 | 19.80 | 17.7382 | 75.3218 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
| 900 | 16.00 | 17.1653 | 74.2456 | Biomass | NaN | NaN | NaN | NaN | NaN | NaN |
plt.figure(figsize=(12,8))
sns.scatterplot(x='generation_gwh_2016',y='capacity_mw',hue='primary_fuel', data=df)
<AxesSubplot:xlabel='generation_gwh_2016', ylabel='capacity_mw'>
plt.figure(figsize=(12,8))
sns.scatterplot(x='generation_gwh_2017',y='capacity_mw',hue='primary_fuel', data=df)
<AxesSubplot:xlabel='generation_gwh_2017', ylabel='capacity_mw'>
plt.figure(figsize=(12,8))
sns.scatterplot(x='generation_gwh_2018',y='capacity_mw',hue='primary_fuel', data=df)
<AxesSubplot:xlabel='generation_gwh_2018', ylabel='capacity_mw'>
sns.scatterplot(x='capacity_mw',y='commissioning_year',hue='primary_fuel', data=df)
<AxesSubplot:xlabel='capacity_mw', ylabel='commissioning_year'>
df.corr()
| capacity_mw | latitude | longitude | commissioning_year | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|
| capacity_mw | 1.000000 | 0.051309 | 0.197129 | 0.304087 | 0.839094 | 0.844352 | 0.870620 | 0.886673 | 0.901918 |
| latitude | 0.051309 | 1.000000 | 0.036362 | 0.129902 | 0.060330 | 0.048610 | 0.041387 | 0.040568 | 0.040476 |
| longitude | 0.197129 | 0.036362 | 1.000000 | 0.121910 | 0.013511 | 0.022760 | 0.050732 | 0.048678 | 0.045599 |
| commissioning_year | 0.304087 | 0.129902 | 0.121910 | 1.000000 | 0.160014 | 0.170363 | 0.193743 | 0.191367 | 0.193694 |
| generation_gwh_2014 | 0.839094 | 0.060330 | 0.013511 | 0.160014 | 1.000000 | 0.961098 | 0.937060 | 0.912561 | 0.902781 |
| generation_gwh_2015 | 0.844352 | 0.048610 | 0.022760 | 0.170363 | 0.961098 | 1.000000 | 0.974918 | 0.940191 | 0.934371 |
| generation_gwh_2016 | 0.870620 | 0.041387 | 0.050732 | 0.193743 | 0.937060 | 0.974918 | 1.000000 | 0.972024 | 0.962073 |
| generation_gwh_2017 | 0.886673 | 0.040568 | 0.048678 | 0.191367 | 0.912561 | 0.940191 | 0.972024 | 1.000000 | 0.985856 |
| generation_gwh_2018 | 0.901918 | 0.040476 | 0.045599 | 0.193694 | 0.902781 | 0.934371 | 0.962073 | 0.985856 | 1.000000 |
plt.figure(figsize=(12,8))
sns.heatmap(df.corr(),annot=True)
<AxesSubplot:>
plt.figure(figsize=(12,8))
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x179aa1bf280>
<Figure size 1200x800 with 0 Axes>
df.groupby('primary_fuel')['generation_gwh_2014'].mean()
primary_fuel Biomass NaN Coal 4737.279310 Gas 756.601441 Hydro 780.429852 Nuclear 4867.556164 Oil 130.881366 Solar NaN Wind NaN Name: generation_gwh_2014, dtype: float64
df.groupby('primary_fuel')['generation_gwh_2015'].mean()
primary_fuel Biomass NaN Coal 4649.585751 Gas 840.159648 Hydro 709.991238 Nuclear 4234.875000 Oil 21.710086 Solar NaN Wind NaN Name: generation_gwh_2015, dtype: float64
df.groupby('primary_fuel')['generation_gwh_2016'].mean()
primary_fuel Biomass NaN Coal 4761.233946 Gas 826.305503 Hydro 699.318079 Nuclear 4272.608750 Oil 4.796871 Solar NaN Wind NaN Name: generation_gwh_2016, dtype: float64
df.groupby('primary_fuel')['generation_gwh_2017'].mean()
primary_fuel Biomass NaN Coal 4870.537285 Gas 881.216187 Hydro 693.884741 Nuclear 4323.415000 Oil 0.167471 Solar NaN Wind NaN Name: generation_gwh_2017, dtype: float64
df.groupby('primary_fuel')['generation_gwh_2018'].mean()
primary_fuel Biomass NaN Coal 5036.420635 Gas 846.658418 Hydro 737.636455 Nuclear 4277.031250 Oil 0.295215 Solar NaN Wind NaN Name: generation_gwh_2018, dtype: float64
the_mean = df.groupby('primary_fuel').mean()
the_mean
| capacity_mw | latitude | longitude | commissioning_year | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|
| primary_fuel | |||||||||
| Biomass | 20.065200 | 17.460458 | 75.679052 | NaN | NaN | NaN | NaN | NaN | NaN |
| Coal | 797.826434 | 21.657714 | 79.431460 | 2006.021164 | 4737.279310 | 4649.585751 | 4761.233946 | 4870.537285 | 5036.420635 |
| Gas | 364.818928 | 20.050144 | 78.408238 | 2002.830508 | 756.601441 | 840.159648 | 826.305503 | 881.216187 | 846.658418 |
| Hydro | 185.026972 | 22.258483 | 78.846256 | 1988.709163 | 780.429852 | 709.991238 | 699.318079 | 693.884741 | 737.636455 |
| Nuclear | 975.555556 | 18.081478 | 76.124056 | 1994.250000 | 4867.556164 | 4234.875000 | 4272.608750 | 4323.415000 | 4277.031250 |
| Oil | 88.942000 | 17.311847 | 74.833806 | 1994.583333 | 130.881366 | 21.710086 | 4.796871 | 0.167471 | 0.295215 |
| Solar | 21.712598 | 24.095380 | 74.352328 | 2013.375000 | NaN | NaN | NaN | NaN | NaN |
| Wind | 33.429675 | 17.857224 | 74.181553 | NaN | NaN | NaN | NaN | NaN | NaN |
df.plot(x="longitude", y="latitude", kind="scatter",colormap="YlOrRd")
<AxesSubplot:xlabel='longitude', ylabel='latitude'>
visual
| primary_fuel | capacity_mw | |
|---|---|---|
| 0 | Biomass | 20.065200 |
| 1 | Coal | 797.826434 |
| 2 | Gas | 364.818928 |
| 3 | Hydro | 185.026972 |
| 4 | Nuclear | 975.555556 |
| 5 | Oil | 88.942000 |
| 6 | Solar | 21.712598 |
| 7 | Wind | 33.429675 |
data=temp[['longitude','latitude','name']].copy()
data.dropna(inplace=True)
data.isnull().sum()
longitude 0 latitude 0 name 0 dtype: int64
pip install folium
Collecting folium
Downloading folium-0.14.0-py2.py3-none-any.whl (102 kB)
-------------------------------------- 102.3/102.3 kB 1.5 MB/s eta 0:00:00
Requirement already satisfied: requests in c:\users\lenovo\anaconda3\lib\site-packages (from folium) (2.28.1)
Requirement already satisfied: jinja2>=2.9 in c:\users\lenovo\anaconda3\lib\site-packages (from folium) (2.11.3)
Requirement already satisfied: numpy in c:\users\lenovo\anaconda3\lib\site-packages (from folium) (1.21.5)
Collecting branca>=0.6.0
Downloading branca-0.6.0-py3-none-any.whl (24 kB)
Requirement already satisfied: MarkupSafe>=0.23 in c:\users\lenovo\anaconda3\lib\site-packages (from jinja2>=2.9->folium) (2.0.1)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\lenovo\anaconda3\lib\site-packages (from requests->folium) (2022.9.14)
Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\lenovo\anaconda3\lib\site-packages (from requests->folium) (2.0.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\lenovo\anaconda3\lib\site-packages (from requests->folium) (1.26.11)
Requirement already satisfied: idna<4,>=2.5 in c:\users\lenovo\anaconda3\lib\site-packages (from requests->folium) (3.3)
Installing collected packages: branca, folium
Successfully installed branca-0.6.0 folium-0.14.0
Note: you may need to restart the kernel to use updated packages.
#installing and importing folium
import folium
#The first step is creating a map of the place we want to go.Passing the mean of the latitude and longitude coordinates
#we need to center the map there using the location parameter.
map= folium.Map(location=[data['latitude'].mean(),data['longitude'].mean()],zoom_start=5,control_scale=True)
# Adding markers to the map
for index,location in data.iterrows():
folium.Marker([location['latitude'],location['longitude']],popup=location['name']).add_to(map)
map
df.head()
| capacity_mw | latitude | longitude | primary_fuel | commissioning_year | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.5 | 28.1839 | 73.2407 | Solar | 2011.0 | NaN | NaN | NaN | NaN | NaN |
| 1 | 98.0 | 24.7663 | 74.6090 | Coal | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 39.2 | 21.9038 | 69.3732 | Wind | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 135.0 | 23.8712 | 91.3602 | Gas | 2004.0 | 617.789264 | 843.747 | 886.004428 | 663.774500 | 626.239128 |
| 4 | 1800.0 | 21.9603 | 82.4091 | Coal | 2015.0 | 3035.550000 | 5916.370 | 6243.000000 | 5385.579736 | 7279.000000 |
the_mean
| capacity_mw | latitude | longitude | commissioning_year | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|
| primary_fuel | |||||||||
| Biomass | 20.065200 | 17.460458 | 75.679052 | NaN | NaN | NaN | NaN | NaN | NaN |
| Coal | 797.826434 | 21.657714 | 79.431460 | 2006.021164 | 4737.279310 | 4649.585751 | 4761.233946 | 4870.537285 | 5036.420635 |
| Gas | 364.818928 | 20.050144 | 78.408238 | 2002.830508 | 756.601441 | 840.159648 | 826.305503 | 881.216187 | 846.658418 |
| Hydro | 185.026972 | 22.258483 | 78.846256 | 1988.709163 | 780.429852 | 709.991238 | 699.318079 | 693.884741 | 737.636455 |
| Nuclear | 975.555556 | 18.081478 | 76.124056 | 1994.250000 | 4867.556164 | 4234.875000 | 4272.608750 | 4323.415000 | 4277.031250 |
| Oil | 88.942000 | 17.311847 | 74.833806 | 1994.583333 | 130.881366 | 21.710086 | 4.796871 | 0.167471 | 0.295215 |
| Solar | 21.712598 | 24.095380 | 74.352328 | 2013.375000 | NaN | NaN | NaN | NaN | NaN |
| Wind | 33.429675 | 17.857224 | 74.181553 | NaN | NaN | NaN | NaN | NaN | NaN |
df
| capacity_mw | latitude | longitude | primary_fuel | commissioning_year | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.5 | 28.1839 | 73.2407 | Solar | 2011.0 | NaN | NaN | NaN | NaN | NaN |
| 1 | 98.0 | 24.7663 | 74.6090 | Coal | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 39.2 | 21.9038 | 69.3732 | Wind | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 135.0 | 23.8712 | 91.3602 | Gas | 2004.0 | 617.789264 | 843.747000 | 886.004428 | 663.774500 | 626.239128 |
| 4 | 1800.0 | 21.9603 | 82.4091 | Coal | 2015.0 | 3035.550000 | 5916.370000 | 6243.000000 | 5385.579736 | 7279.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 902 | 1600.0 | 16.2949 | 77.3568 | Coal | 2016.0 | NaN | 0.994875 | 233.596650 | 865.400000 | 686.500000 |
| 903 | 3.0 | 12.8932 | 78.1654 | Solar | NaN | NaN | NaN | NaN | NaN | NaN |
| 904 | 25.5 | 15.2758 | 75.5811 | Wind | NaN | NaN | NaN | NaN | NaN | NaN |
| 905 | 80.0 | 24.3500 | 73.7477 | Coal | NaN | NaN | NaN | NaN | NaN | NaN |
| 906 | 16.5 | 9.9344 | 77.4768 | Wind | NaN | NaN | NaN | NaN | NaN | NaN |
907 rows × 10 columns
df['latitude'].mode()
0 19.0004 1 24.1917 Name: latitude, dtype: float64
df['longitude'].mode()
0 71.6917 Name: longitude, dtype: float64
#Filling in the null values by mode since we can't identify a geolocation relationship with any other column.
df[['latitude','longitude']].mode()
| latitude | longitude | |
|---|---|---|
| 0 | 19.0004 | 71.6917 |
| 1 | 24.1917 | NaN |
df['commissioning_year'].mode()
0 2015.0 Name: commissioning_year, dtype: float64
# Substituting 19.0004 for latitude and 71.6917 for longitude.
df['latitude'].fillna(df['latitude'].mode()[0], inplace=True)
df['longitude'].fillna(df['longitude'].mode()[0], inplace=True)
df['commissioning_year'].fillna(df['commissioning_year'].mode()[0], inplace=True)
df['latitude'].isnull().sum()
0
df['longitude'].isnull().sum()
0
df['commissioning_year'].isnull().sum()
0
sns.boxplot(df['generation_gwh_2014'])
<AxesSubplot:xlabel='generation_gwh_2014'>
# Determining the median power produced in 2014 according to the fuel type.
df.groupby('primary_fuel').mean()
| capacity_mw | latitude | longitude | commissioning_year | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|
| primary_fuel | |||||||||
| Biomass | 20.065200 | 17.460458 | 75.679052 | 2015.000000 | NaN | NaN | NaN | NaN | NaN |
| Coal | 797.826434 | 21.606216 | 79.281465 | 2008.422481 | 4737.279310 | 4649.585751 | 4761.233946 | 4870.537285 | 5036.420635 |
| Gas | 364.818928 | 20.034930 | 78.310897 | 2004.594203 | 756.601441 | 840.159648 | 826.305503 | 881.216187 | 846.658418 |
| Hydro | 185.026972 | 22.024835 | 78.333180 | 1988.709163 | 780.429852 | 709.991238 | 699.318079 | 693.884741 | 737.636455 |
| Nuclear | 975.555556 | 18.081478 | 76.124056 | 1996.555556 | 4867.556164 | 4234.875000 | 4272.608750 | 4323.415000 | 4277.031250 |
| Oil | 88.942000 | 17.565130 | 74.362490 | 2002.750000 | 130.881366 | 21.710086 | 4.796871 | 0.167471 | 0.295215 |
| Solar | 21.712598 | 23.934908 | 74.268528 | 2014.897638 | NaN | NaN | NaN | NaN | NaN |
| Wind | 33.429675 | 17.996636 | 73.877912 | 2015.000000 | NaN | NaN | NaN | NaN | NaN |
df.head()
| capacity_mw | latitude | longitude | primary_fuel | commissioning_year | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.5 | 28.1839 | 73.2407 | Solar | 2011.0 | NaN | NaN | NaN | NaN | NaN |
| 1 | 98.0 | 24.7663 | 74.6090 | Coal | 2015.0 | NaN | NaN | NaN | NaN | NaN |
| 2 | 39.2 | 21.9038 | 69.3732 | Wind | 2015.0 | NaN | NaN | NaN | NaN | NaN |
| 3 | 135.0 | 23.8712 | 91.3602 | Gas | 2004.0 | 617.789264 | 843.747 | 886.004428 | 663.774500 | 626.239128 |
| 4 | 1800.0 | 21.9603 | 82.4091 | Coal | 2015.0 | 3035.550000 | 5916.370 | 6243.000000 | 5385.579736 | 7279.000000 |
df['primary_fuel']
0 Solar
1 Coal
2 Wind
3 Gas
4 Coal
...
902 Coal
903 Solar
904 Wind
905 Coal
906 Wind
Name: primary_fuel, Length: 907, dtype: object
X= df.drop('primary_fuel', axis=1)
Y= df['primary_fuel']
X.shape , Y.shape
((907, 9), (907,))
from sklearn.preprocessing import LabelEncoder
le= LabelEncoder()
X['primary_fuel']=le.fit_transform(Y)
plt.figure(figsize=(12,8))
sns.heatmap(X.corr(),annot=True)
<AxesSubplot:>
#Low correlation exists between the target variable and mainly independent factors.
#generation columns and the target variable have a strong negative correlation.
#Despite the fact that there is some multicolinearity among the independent variables
df.groupby('primary_fuel')['generation_gwh_2014'].mean()
primary_fuel Biomass NaN Coal 4737.279310 Gas 756.601441 Hydro 780.429852 Nuclear 4867.556164 Oil 130.881366 Solar NaN Wind NaN Name: generation_gwh_2014, dtype: float64
df.head()
| capacity_mw | latitude | longitude | primary_fuel | commissioning_year | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.5 | 28.1839 | 73.2407 | Solar | 2011.0 | NaN | NaN | NaN | NaN | NaN |
| 1 | 98.0 | 24.7663 | 74.6090 | Coal | 2015.0 | NaN | NaN | NaN | NaN | NaN |
| 2 | 39.2 | 21.9038 | 69.3732 | Wind | 2015.0 | NaN | NaN | NaN | NaN | NaN |
| 3 | 135.0 | 23.8712 | 91.3602 | Gas | 2004.0 | 617.789264 | 843.747 | 886.004428 | 663.774500 | 626.239128 |
| 4 | 1800.0 | 21.9603 | 82.4091 | Coal | 2015.0 | 3035.550000 | 5916.370 | 6243.000000 | 5385.579736 | 7279.000000 |
df[df['primary_fuel']=='Solar'].index
Int64Index([ 0, 25, 26, 28, 32, 33, 34, 35, 36, 37,
...
771, 779, 827, 829, 832, 835, 887, 894, 895, 903],
dtype='int64', length=127)
df.drop(df[df['primary_fuel']=='Solar'].index, inplace = True)
df.drop(df[df['primary_fuel']=='Wind'].index, inplace = True)
df.drop(df[df['primary_fuel']=='Biomass'].index, inplace = True)
df.shape
(607, 10)
df.isnull().sum()
capacity_mw 0 latitude 0 longitude 0 primary_fuel 0 commissioning_year 0 generation_gwh_2014 209 generation_gwh_2015 185 generation_gwh_2016 173 generation_gwh_2017 167 generation_gwh_2018 159 dtype: int64
# Will now fill in the gaps in the generation column group using the average fuel type.
df.groupby('primary_fuel')['generation_gwh_2014'].mean()
primary_fuel Coal 4737.279310 Gas 756.601441 Hydro 780.429852 Nuclear 4867.556164 Oil 130.881366 Name: generation_gwh_2014, dtype: float64
df['generation_gwh_2014']=np.where(df['primary_fuel']=='Coal',df['generation_gwh_2014'].fillna(4737.279310),df['generation_gwh_2014'] )
df['generation_gwh_2014']=np.where(df['primary_fuel']=='Gas',df['generation_gwh_2014'].fillna(7756.601441),df['generation_gwh_2014'] )
df['generation_gwh_2014']=np.where(df['primary_fuel']=='Hydro',df['generation_gwh_2014'].fillna(780.429852),df['generation_gwh_2014'] )
df['generation_gwh_2014']=np.where(df['primary_fuel']=='Nuclear',df['generation_gwh_2014'].fillna(4867.556164),df['generation_gwh_2014'] )
df['generation_gwh_2014']=np.where(df['primary_fuel']=='Oil',df['generation_gwh_2014'].fillna(130.881366),df['generation_gwh_2014'] )
df.groupby('primary_fuel')['generation_gwh_2015'].mean()
primary_fuel Coal 4649.585751 Gas 840.159648 Hydro 709.991238 Nuclear 4234.875000 Oil 21.710086 Name: generation_gwh_2015, dtype: float64
df['generation_gwh_2015']=np.where(df['primary_fuel']=='Coal',df['generation_gwh_2015'].fillna(4649.585751),df['generation_gwh_2015'] )
df['generation_gwh_2015']=np.where(df['primary_fuel']=='Gas',df['generation_gwh_2015'].fillna(840.159648),df['generation_gwh_2015'] )
df['generation_gwh_2015']=np.where(df['primary_fuel']=='Hydro',df['generation_gwh_2015'].fillna(709.991238),df['generation_gwh_2015'] )
df['generation_gwh_2015']=np.where(df['primary_fuel']=='Nuclear',df['generation_gwh_2015'].fillna(4234.875000),df['generation_gwh_2015'] )
df['generation_gwh_2015']=np.where(df['primary_fuel']=='Oil',df['generation_gwh_2015'].fillna(21.710086),df['generation_gwh_2015'] )
df.groupby('primary_fuel')['generation_gwh_2016'].mean()
primary_fuel Coal 4761.233946 Gas 826.305503 Hydro 699.318079 Nuclear 4272.608750 Oil 4.796871 Name: generation_gwh_2016, dtype: float64
df['generation_gwh_2016']=np.where(df['primary_fuel']=='Coal',df['generation_gwh_2016'].fillna(4761.233946),df['generation_gwh_2016'] )
df['generation_gwh_2016']=np.where(df['primary_fuel']=='Gas',df['generation_gwh_2016'].fillna(826.305503),df['generation_gwh_2016'] )
df['generation_gwh_2016']=np.where(df['primary_fuel']=='Hydro',df['generation_gwh_2016'].fillna(699.318079),df['generation_gwh_2016'] )
df['generation_gwh_2016']=np.where(df['primary_fuel']=='Nuclear',df['generation_gwh_2016'].fillna(4272.608750),df['generation_gwh_2016'] )
df['generation_gwh_2016']=np.where(df['primary_fuel']=='Oil',df['generation_gwh_2016'].fillna(4.796871),df['generation_gwh_2016'] )
df.groupby('primary_fuel')['generation_gwh_2017'].mean()
primary_fuel Coal 4870.537285 Gas 881.216187 Hydro 693.884741 Nuclear 4323.415000 Oil 0.167471 Name: generation_gwh_2017, dtype: float64
df['generation_gwh_2017']=np.where(df['primary_fuel']=='Coal',df['generation_gwh_2017'].fillna(4870.537285),df['generation_gwh_2017'] )
df['generation_gwh_2017']=np.where(df['primary_fuel']=='Gas',df['generation_gwh_2017'].fillna(881.216187),df['generation_gwh_2017'] )
df['generation_gwh_2017']=np.where(df['primary_fuel']=='Hydro',df['generation_gwh_2017'].fillna(693.884741),df['generation_gwh_2017'] )
df['generation_gwh_2017']=np.where(df['primary_fuel']=='Nuclear',df['generation_gwh_2017'].fillna(4323.415000),df['generation_gwh_2017'] )
df['generation_gwh_2017']=np.where(df['primary_fuel']=='Oil',df['generation_gwh_2017'].fillna(0.167471),df['generation_gwh_2017'] )
df.groupby('primary_fuel')['generation_gwh_2018'].mean()
primary_fuel Coal 5036.420635 Gas 846.658418 Hydro 737.636455 Nuclear 4277.031250 Oil 0.295215 Name: generation_gwh_2018, dtype: float64
df['generation_gwh_2018']=np.where(df['primary_fuel']=='Coal',df['generation_gwh_2018'].fillna(5036.420635),df['generation_gwh_2018'] )
df['generation_gwh_2018']=np.where(df['primary_fuel']=='Gas',df['generation_gwh_2018'].fillna(846.658418),df['generation_gwh_2018'] )
df['generation_gwh_2018']=np.where(df['primary_fuel']=='Hydro',df['generation_gwh_2018'].fillna(737.636455),df['generation_gwh_2018'] )
df['generation_gwh_2018']=np.where(df['primary_fuel']=='Nuclear',df['generation_gwh_2018'].fillna(4277.031250),df['generation_gwh_2018'] )
df['generation_gwh_2018']=np.where(df['primary_fuel']=='Oil',df['generation_gwh_2018'].fillna(0.295215),df['generation_gwh_2018'] )
df.isnull().sum()
capacity_mw 0 latitude 0 longitude 0 primary_fuel 0 commissioning_year 0 generation_gwh_2014 0 generation_gwh_2015 0 generation_gwh_2016 0 generation_gwh_2017 0 generation_gwh_2018 0 dtype: int64
df.head()
| capacity_mw | latitude | longitude | primary_fuel | commissioning_year | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 98.0 | 24.7663 | 74.6090 | Coal | 2015.0 | 4737.279310 | 4649.585751 | 4761.233946 | 4870.537285 | 5036.420635 |
| 3 | 135.0 | 23.8712 | 91.3602 | Gas | 2004.0 | 617.789264 | 843.747000 | 886.004428 | 663.774500 | 626.239128 |
| 4 | 1800.0 | 21.9603 | 82.4091 | Coal | 2015.0 | 3035.550000 | 5916.370000 | 6243.000000 | 5385.579736 | 7279.000000 |
| 5 | 250.0 | 23.7689 | 68.6447 | Coal | 2005.0 | 1153.421000 | 1208.852000 | 1175.765000 | 1147.913000 | 976.655000 |
| 6 | 60.0 | 10.4547 | 77.0078 | Hydro | 1970.0 | 157.558250 | 152.195200 | 61.421350 | 89.629600 | 48.327150 |
X= df.drop('primary_fuel',axis=1)
Y = df['primary_fuel']
X.shape , Y.shape
((607, 9), (607,))
X.head()
| capacity_mw | latitude | longitude | commissioning_year | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 98.0 | 24.7663 | 74.6090 | 2015.0 | 4737.279310 | 4649.585751 | 4761.233946 | 4870.537285 | 5036.420635 |
| 3 | 135.0 | 23.8712 | 91.3602 | 2004.0 | 617.789264 | 843.747000 | 886.004428 | 663.774500 | 626.239128 |
| 4 | 1800.0 | 21.9603 | 82.4091 | 2015.0 | 3035.550000 | 5916.370000 | 6243.000000 | 5385.579736 | 7279.000000 |
| 5 | 250.0 | 23.7689 | 68.6447 | 2005.0 | 1153.421000 | 1208.852000 | 1175.765000 | 1147.913000 | 976.655000 |
| 6 | 60.0 | 10.4547 | 77.0078 | 1970.0 | 157.558250 | 152.195200 | 61.421350 | 89.629600 | 48.327150 |
X.skew()
capacity_mw 2.523310 latitude -0.058579 longitude 0.986666 commissioning_year -1.198052 generation_gwh_2014 3.078832 generation_gwh_2015 3.826520 generation_gwh_2016 3.622211 generation_gwh_2017 3.673231 generation_gwh_2018 3.559440 dtype: float64
for i in X.columns:
sns.boxplot(X[i])
plt.show()
#We won't deal with longitude outliers because those are places.
#Data by the commissioning year are categorical.
for i in X.columns:
sns.distplot(X[i])
plt.show()
col=['capacity_mw','generation_gwh_2014','generation_gwh_2015','generation_gwh_2016','generation_gwh_2017','generation_gwh_2018']
for i in col:
IQR= X[i].quantile(.75)-X[i].quantile(.25)
upper= (X[i].quantile(.75) +(1.5* IQR))
X[i]=np.where(X[i]>upper,upper,X[i])
for i in X.columns:
sns.boxplot(X[i])
plt.show()
X.skew()
capacity_mw 1.109633 latitude -0.058579 longitude 0.986666 commissioning_year -1.198052 generation_gwh_2014 1.282673 generation_gwh_2015 1.518384 generation_gwh_2016 1.476022 generation_gwh_2017 1.409096 generation_gwh_2018 1.454508 dtype: float64
# By managing outliers, skewness has been reduced.
v=X.copy()
v['Total']=X['generation_gwh_2014']+X['generation_gwh_2015']+X['generation_gwh_2016']+X['generation_gwh_2017']+X['generation_gwh_2018']
v.drop(['generation_gwh_2014','generation_gwh_2015','generation_gwh_2016','generation_gwh_2017','generation_gwh_2018'],axis=1,inplace=True)
v
| capacity_mw | latitude | longitude | commissioning_year | Total | |
|---|---|---|---|---|---|
| 1 | 98.0 | 24.7663 | 74.6090 | 2015.0 | 24055.056927 |
| 3 | 135.0 | 23.8712 | 91.3602 | 2004.0 | 3637.554320 |
| 4 | 1453.5 | 21.9603 | 82.4091 | 2015.0 | 27859.499736 |
| 5 | 250.0 | 23.7689 | 68.6447 | 2005.0 | 5662.606000 |
| 6 | 60.0 | 10.4547 | 77.0078 | 1970.0 | 509.131550 |
| ... | ... | ... | ... | ... | ... |
| 893 | 62.4 | 30.3033 | 77.5684 | 1991.0 | 984.064550 |
| 899 | 600.0 | 30.1081 | 77.3257 | 2007.0 | 16223.198930 |
| 901 | 106.6 | 13.1156 | 77.5838 | 1993.0 | 0.167471 |
| 902 | 1453.5 | 16.2949 | 77.3568 | 2016.0 | 6523.770835 |
| 905 | 80.0 | 24.3500 | 73.7477 | 2015.0 | 24055.056927 |
607 rows × 5 columns
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
scaled=sc.fit_transform(v)
scaled
array([[-0.6584317 , 0.53608541, -0.77608434, 0.91147876, 0.96271486],
[-0.58105619, 0.39288959, 2.50590045, 0.26575529, -0.62323304],
[ 2.17623053, 0.08718866, 0.75215301, 0.91147876, 1.25822839],
...,
[-0.64044712, -1.32776397, -0.1932456 , -0.37996817, -0.90577034],
[ 2.17623053, -0.81914763, -0.23772065, 0.97018089, -0.39904358],
[-0.69607384, 0.4694868 , -0.94483485, 0.91147876, 0.96271486]])
from statsmodels.stats.outliers_influence import variance_inflation_factor
VIF= pd.DataFrame()
VIF['features']=v.columns
VIF['vif']= [variance_inflation_factor(scaled,i) for i in range(len(v.columns))]
VIF
| features | vif | |
|---|---|---|
| 0 | capacity_mw | 1.789803 |
| 1 | latitude | 1.038009 |
| 2 | longitude | 1.043452 |
| 3 | commissioning_year | 1.127938 |
| 4 | Total | 1.888203 |
# Controlled multicolinearity
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
Y=le.fit_transform(Y)
from sklearn.preprocessing import power_transform
transformed=power_transform(scaled)
inp=pd.DataFrame(transformed, columns=v.columns)
from imblearn.over_sampling import SMOTE
sm=SMOTE()
X,Y=sm.fit_resample(inp,Y)
X.shape , Y.shape
((1290, 5), (1290,))
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score
# Best Random_state to find
maxaccu=0
maxRS=0
for i in range(0,200):
x_train,x_test,y_train,y_test= train_test_split(X,Y,random_state=i,test_size=.20)
LR= LogisticRegression()
LR.fit(x_train,y_train)
pred= LR.predict(x_test)
acc=accuracy_score(y_test,pred)
if acc>maxaccu:
maxaccu=acc
maxRS=i
print("Best accuracy is ",maxaccu,"on Random State =",maxRS)
Best accuracy is 0.7906976744186046 on Random State = 84
#Random State=84
x_train,x_test,y_train,y_test= train_test_split(X,Y,random_state=84,test_size=.20)
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
LR_model= LogisticRegression()
RD_model= RidgeClassifier()
DT_model= DecisionTreeClassifier()
SV_model= SVC()
KNR_model= KNeighborsClassifier()
RFR_model= RandomForestClassifier()
XGB_model= XGBClassifier()
SGH_model= SGDClassifier()
Bag_model=BaggingClassifier()
ADA_model=AdaBoostClassifier()
GB_model= GradientBoostingClassifier()
model=[LR_model,RD_model,DT_model,SV_model,KNR_model,RFR_model,XGB_model,SGH_model,Bag_model,ADA_model,GB_model ]
for m in model:
m.fit(x_train,y_train)
m.score(x_train,y_train)
pred= m.predict(x_test)
print('Accuracy_Score of ',m, 'is', accuracy_score(y_test,pred)*100)
print('Confusion Matrix of ',m,' is \n', confusion_matrix(y_test,pred) )
print(classification_report(y_test,pred))
print('*'*50)
Accuracy_Score of LogisticRegression() is 75.5813953488372
Confusion Matrix of LogisticRegression() is
[[41 4 2 12 0]
[ 3 25 10 7 4]
[ 2 5 28 1 0]
[ 7 0 4 42 0]
[ 0 0 2 0 59]]
precision recall f1-score support
0 0.77 0.69 0.73 59
1 0.74 0.51 0.60 49
2 0.61 0.78 0.68 36
3 0.68 0.79 0.73 53
4 0.94 0.97 0.95 61
accuracy 0.76 258
macro avg 0.75 0.75 0.74 258
weighted avg 0.76 0.76 0.75 258
**************************************************
Accuracy_Score of RidgeClassifier() is 66.66666666666666
Confusion Matrix of RidgeClassifier() is
[[36 4 4 15 0]
[ 8 4 13 9 15]
[ 2 1 24 2 7]
[ 0 0 4 49 0]
[ 0 0 2 0 59]]
precision recall f1-score support
0 0.78 0.61 0.69 59
1 0.44 0.08 0.14 49
2 0.51 0.67 0.58 36
3 0.65 0.92 0.77 53
4 0.73 0.97 0.83 61
accuracy 0.67 258
macro avg 0.62 0.65 0.60 258
weighted avg 0.64 0.67 0.62 258
**************************************************
Accuracy_Score of DecisionTreeClassifier() is 87.20930232558139
Confusion Matrix of DecisionTreeClassifier() is
[[47 4 5 3 0]
[ 4 37 7 0 1]
[ 3 2 30 0 1]
[ 2 0 0 51 0]
[ 0 0 1 0 60]]
precision recall f1-score support
0 0.84 0.80 0.82 59
1 0.86 0.76 0.80 49
2 0.70 0.83 0.76 36
3 0.94 0.96 0.95 53
4 0.97 0.98 0.98 61
accuracy 0.87 258
macro avg 0.86 0.87 0.86 258
weighted avg 0.88 0.87 0.87 258
**************************************************
Accuracy_Score of SVC() is 88.75968992248062
Confusion Matrix of SVC() is
[[46 4 2 7 0]
[ 1 40 2 3 3]
[ 2 1 29 1 3]
[ 0 0 0 53 0]
[ 0 0 0 0 61]]
precision recall f1-score support
0 0.94 0.78 0.85 59
1 0.89 0.82 0.85 49
2 0.88 0.81 0.84 36
3 0.83 1.00 0.91 53
4 0.91 1.00 0.95 61
accuracy 0.89 258
macro avg 0.89 0.88 0.88 258
weighted avg 0.89 0.89 0.89 258
**************************************************
Accuracy_Score of KNeighborsClassifier() is 88.75968992248062
Confusion Matrix of KNeighborsClassifier() is
[[48 1 2 8 0]
[ 1 40 4 2 2]
[ 2 2 30 0 2]
[ 0 1 0 52 0]
[ 0 1 1 0 59]]
precision recall f1-score support
0 0.94 0.81 0.87 59
1 0.89 0.82 0.85 49
2 0.81 0.83 0.82 36
3 0.84 0.98 0.90 53
4 0.94 0.97 0.95 61
accuracy 0.89 258
macro avg 0.88 0.88 0.88 258
weighted avg 0.89 0.89 0.89 258
**************************************************
Accuracy_Score of RandomForestClassifier() is 94.18604651162791
Confusion Matrix of RandomForestClassifier() is
[[53 0 3 3 0]
[ 0 43 6 0 0]
[ 2 1 33 0 0]
[ 0 0 0 53 0]
[ 0 0 0 0 61]]
precision recall f1-score support
0 0.96 0.90 0.93 59
1 0.98 0.88 0.92 49
2 0.79 0.92 0.85 36
3 0.95 1.00 0.97 53
4 1.00 1.00 1.00 61
accuracy 0.94 258
macro avg 0.93 0.94 0.93 258
weighted avg 0.95 0.94 0.94 258
**************************************************
Accuracy_Score of XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
objective='multi:softprob', predictor=None, ...) is 92.63565891472868
Confusion Matrix of XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
objective='multi:softprob', predictor=None, ...) is
[[51 0 3 5 0]
[ 0 42 6 0 1]
[ 4 0 32 0 0]
[ 0 0 0 53 0]
[ 0 0 0 0 61]]
precision recall f1-score support
0 0.93 0.86 0.89 59
1 1.00 0.86 0.92 49
2 0.78 0.89 0.83 36
3 0.91 1.00 0.95 53
4 0.98 1.00 0.99 61
accuracy 0.93 258
macro avg 0.92 0.92 0.92 258
weighted avg 0.93 0.93 0.93 258
**************************************************
Accuracy_Score of SGDClassifier() is 57.36434108527132
Confusion Matrix of SGDClassifier() is
[[37 8 1 13 0]
[13 27 5 4 0]
[ 2 5 25 4 0]
[26 2 4 21 0]
[ 0 5 18 0 38]]
precision recall f1-score support
0 0.47 0.63 0.54 59
1 0.57 0.55 0.56 49
2 0.47 0.69 0.56 36
3 0.50 0.40 0.44 53
4 1.00 0.62 0.77 61
accuracy 0.57 258
macro avg 0.60 0.58 0.57 258
weighted avg 0.62 0.57 0.58 258
**************************************************
Accuracy_Score of BaggingClassifier() is 90.31007751937985
Confusion Matrix of BaggingClassifier() is
[[50 1 5 3 0]
[ 5 41 2 0 1]
[ 2 1 32 0 1]
[ 4 0 0 49 0]
[ 0 0 0 0 61]]
precision recall f1-score support
0 0.82 0.85 0.83 59
1 0.95 0.84 0.89 49
2 0.82 0.89 0.85 36
3 0.94 0.92 0.93 53
4 0.97 1.00 0.98 61
accuracy 0.90 258
macro avg 0.90 0.90 0.90 258
weighted avg 0.91 0.90 0.90 258
**************************************************
Accuracy_Score of AdaBoostClassifier() is 52.71317829457365
Confusion Matrix of AdaBoostClassifier() is
[[42 8 0 9 0]
[ 4 31 9 4 1]
[ 0 7 28 1 0]
[29 0 2 22 0]
[ 0 7 41 0 13]]
precision recall f1-score support
0 0.56 0.71 0.63 59
1 0.58 0.63 0.61 49
2 0.35 0.78 0.48 36
3 0.61 0.42 0.49 53
4 0.93 0.21 0.35 61
accuracy 0.53 258
macro avg 0.61 0.55 0.51 258
weighted avg 0.63 0.53 0.51 258
**************************************************
Accuracy_Score of GradientBoostingClassifier() is 90.31007751937985
Confusion Matrix of GradientBoostingClassifier() is
[[50 1 4 4 0]
[ 1 41 6 0 1]
[ 3 3 29 0 1]
[ 0 0 0 53 0]
[ 0 0 1 0 60]]
precision recall f1-score support
0 0.93 0.85 0.88 59
1 0.91 0.84 0.87 49
2 0.72 0.81 0.76 36
3 0.93 1.00 0.96 53
4 0.97 0.98 0.98 61
accuracy 0.90 258
macro avg 0.89 0.89 0.89 258
weighted avg 0.91 0.90 0.90 258
**************************************************
from sklearn.model_selection import cross_val_score
for i in model:
print('Accuracy_Score of ',i, 'is', accuracy_score(y_test,i.predict(x_test))*100)
print("cross Validation accuracy score of ",i ," is ",cross_val_score(i,X,Y,cv=5, scoring='accuracy').mean()*100)
print('*'*50)
Accuracy_Score of LogisticRegression() is 75.5813953488372
cross Validation accuracy score of LogisticRegression() is 72.24806201550386
**************************************************
Accuracy_Score of RidgeClassifier() is 66.66666666666666
cross Validation accuracy score of RidgeClassifier() is 63.100775193798455
**************************************************
Accuracy_Score of DecisionTreeClassifier() is 87.20930232558139
cross Validation accuracy score of DecisionTreeClassifier() is 87.67441860465117
**************************************************
Accuracy_Score of SVC() is 88.75968992248062
cross Validation accuracy score of SVC() is 86.2015503875969
**************************************************
Accuracy_Score of KNeighborsClassifier() is 88.75968992248062
cross Validation accuracy score of KNeighborsClassifier() is 87.90697674418604
**************************************************
Accuracy_Score of RandomForestClassifier() is 94.18604651162791
cross Validation accuracy score of RandomForestClassifier() is 91.86046511627907
**************************************************
Accuracy_Score of XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
objective='multi:softprob', predictor=None, ...) is 92.63565891472868
cross Validation accuracy score of XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
objective='multi:softprob', predictor=None, ...) is 91.7829457364341
**************************************************
Accuracy_Score of SGDClassifier() is 57.36434108527132
cross Validation accuracy score of SGDClassifier() is 65.42635658914729
**************************************************
Accuracy_Score of BaggingClassifier() is 90.31007751937985
cross Validation accuracy score of BaggingClassifier() is 89.53488372093024
**************************************************
Accuracy_Score of AdaBoostClassifier() is 52.71317829457365
cross Validation accuracy score of AdaBoostClassifier() is 53.4108527131783
**************************************************
Accuracy_Score of GradientBoostingClassifier() is 90.31007751937985
cross Validation accuracy score of GradientBoostingClassifier() is 90.31007751937985
**************************************************
#XGBoost performed a fantastic job with the generalization model.Accuracy_XGBClassifier's rating is 91.86.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
params={"learning_rate" : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
"max_depth" : [ 3, 4, 5, 6, 8, 10, 12, 15],
"min_child_weight" : [ 1, 3, 5, 7 ],
"gamma" : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
"colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ] }
GCV= RandomizedSearchCV(XGB_model,param_distributions=params,cv=5,scoring='accuracy', n_jobs=-1)
GCV.fit(x_train,y_train)
RandomizedSearchCV(cv=5,
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None, feature_types=None,
gamma=None, gpu_id=None,
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate...
monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None,
objective='multi:softprob',
predictor=None, ...),
n_jobs=-1,
param_distributions={'colsample_bytree': [0.3, 0.4, 0.5,
0.7],
'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
'learning_rate': [0.05, 0.1, 0.15, 0.2,
0.25, 0.3],
'max_depth': [3, 4, 5, 6, 8, 10, 12,
15],
'min_child_weight': [1, 3, 5, 7]},
scoring='accuracy')
GCV.best_estimator_
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=0.5, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=0.0, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.3, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=15, max_leaves=None,
min_child_weight=3, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
objective='multi:softprob', predictor=None, ...)
GCV.best_params_
{'min_child_weight': 3,
'max_depth': 15,
'learning_rate': 0.3,
'gamma': 0.0,
'colsample_bytree': 0.5}
xgb_predict=GCV.best_estimator_.predict(x_test)
print(classification_report(y_test,xgb_predict))
precision recall f1-score support
0 0.89 0.86 0.88 59
1 0.96 0.90 0.93 49
2 0.83 0.83 0.83 36
3 0.93 1.00 0.96 53
4 0.98 1.00 0.99 61
accuracy 0.93 258
macro avg 0.92 0.92 0.92 258
weighted avg 0.93 0.93 0.93 258
confusion_matrix(y_test,xgb_predict)
array([[51, 1, 3, 4, 0],
[ 2, 44, 3, 0, 0],
[ 4, 1, 30, 0, 1],
[ 0, 0, 0, 53, 0],
[ 0, 0, 0, 0, 61]], dtype=int64)
sns.heatmap(confusion_matrix(y_test,xgb_predict),annot=True,fmt='d')
<AxesSubplot:>
accuracy_score(y_test,xgb_predict)*100
92.63565891472868
y_pred_prob=GCV.predict_proba(x_test)
y_pred_prob
array([[1.3316079e-04, 1.4766480e-03, 4.7268691e-03, 2.6481887e-04,
9.9339855e-01],
[1.7658512e-03, 1.8155719e-03, 9.9225992e-01, 3.7373162e-03,
4.2135123e-04],
[1.4803427e-02, 6.0548531e-03, 2.8830727e-03, 8.0865400e-05,
9.7617775e-01],
...,
[2.5299259e-02, 4.4000108e-02, 9.2845052e-01, 4.7666061e-04,
1.7734175e-03],
[9.9485695e-01, 1.0733516e-03, 1.2106402e-03, 2.7829804e-03,
7.6079552e-05],
[3.4951095e-03, 9.9122214e-01, 7.4557174e-04, 1.6014204e-04,
4.3769712e-03]], dtype=float32)
from sklearn.metrics import roc_auc_score,roc_curve,plot_roc_curve
roc_auc_score(y_test,y_pred_prob,multi_class='ovr', average='weighted')
0.9959487409364643
# classes' auc-roc curve
fpr={}
tpr={}
threshold={}
n_class=5
for i in range(n_class):
fpr[i],tpr[i],threshold[i]=roc_curve(y_test,y_pred_prob[:,i],pos_label=i)
#plotting
plt.plot(fpr[0],tpr[0],linestyle='--', color='orange',label='Class 0 vs rest' )
plt.plot(fpr[1],tpr[1],linestyle='--', color='green',label='Class 1 vs rest' )
plt.plot(fpr[2],tpr[2],linestyle='--', color='yellow',label='Class 2 vs rest' )
plt.plot(fpr[3],tpr[3],linestyle='--', color='red',label='Class 3 vs rest' )
plt.plot(fpr[4],tpr[4],linestyle='--', color='blue',label='Class 4 vs rest' )
plt.title('Multi class ROC Curve')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='best')
plt.show()
import joblib
joblib.dump(GCV.best_estimator_,'Global_Power_Fuel_prediction.pkl')
['Global_Power_Fuel_prediction.pkl']
df.head()
| capacity_mw | latitude | longitude | primary_fuel | commissioning_year | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 98.0 | 24.7663 | 74.6090 | Coal | 2015.0 | 4737.279310 | 4649.585751 | 4761.233946 | 4870.537285 | 5036.420635 |
| 3 | 135.0 | 23.8712 | 91.3602 | Gas | 2004.0 | 617.789264 | 843.747000 | 886.004428 | 663.774500 | 626.239128 |
| 4 | 1800.0 | 21.9603 | 82.4091 | Coal | 2015.0 | 3035.550000 | 5916.370000 | 6243.000000 | 5385.579736 | 7279.000000 |
| 5 | 250.0 | 23.7689 | 68.6447 | Coal | 2005.0 | 1153.421000 | 1208.852000 | 1175.765000 | 1147.913000 | 976.655000 |
| 6 | 60.0 | 10.4547 | 77.0078 | Hydro | 1970.0 | 157.558250 | 152.195200 | 61.421350 | 89.629600 | 48.327150 |
data=df.copy()
data.isnull().sum()
capacity_mw 0 latitude 0 longitude 0 primary_fuel 0 commissioning_year 0 generation_gwh_2014 0 generation_gwh_2015 0 generation_gwh_2016 0 generation_gwh_2017 0 generation_gwh_2018 0 dtype: int64
#additionally filled in upper stage misisng values
data['Total']=df['generation_gwh_2014']+df['generation_gwh_2015']+df['generation_gwh_2016']+df['generation_gwh_2017']+df['generation_gwh_2018']
data
| capacity_mw | latitude | longitude | primary_fuel | commissioning_year | generation_gwh_2014 | generation_gwh_2015 | generation_gwh_2016 | generation_gwh_2017 | generation_gwh_2018 | Total | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 98.0 | 24.7663 | 74.6090 | Coal | 2015.0 | 4737.279310 | 4649.585751 | 4761.233946 | 4870.537285 | 5036.420635 | 24055.056927 |
| 3 | 135.0 | 23.8712 | 91.3602 | Gas | 2004.0 | 617.789264 | 843.747000 | 886.004428 | 663.774500 | 626.239128 | 3637.554320 |
| 4 | 1800.0 | 21.9603 | 82.4091 | Coal | 2015.0 | 3035.550000 | 5916.370000 | 6243.000000 | 5385.579736 | 7279.000000 | 27859.499736 |
| 5 | 250.0 | 23.7689 | 68.6447 | Coal | 2005.0 | 1153.421000 | 1208.852000 | 1175.765000 | 1147.913000 | 976.655000 | 5662.606000 |
| 6 | 60.0 | 10.4547 | 77.0078 | Hydro | 1970.0 | 157.558250 | 152.195200 | 61.421350 | 89.629600 | 48.327150 | 509.131550 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 893 | 62.4 | 30.3033 | 77.5684 | Hydro | 1991.0 | 184.453100 | 183.000000 | 204.253600 | 175.866250 | 236.491600 | 984.064550 |
| 899 | 600.0 | 30.1081 | 77.3257 | Coal | 2007.0 | 3194.359820 | 3709.552200 | 3123.572061 | 3156.053669 | 3039.661180 | 16223.198930 |
| 901 | 106.6 | 13.1156 | 77.5838 | Oil | 1993.0 | 0.000000 | 0.000000 | 0.000000 | 0.167471 | 0.000000 | 0.167471 |
| 902 | 1600.0 | 16.2949 | 77.3568 | Coal | 2016.0 | 4737.279310 | 0.994875 | 233.596650 | 865.400000 | 686.500000 | 6523.770835 |
| 905 | 80.0 | 24.3500 | 73.7477 | Coal | 2015.0 | 4737.279310 | 4649.585751 | 4761.233946 | 4870.537285 | 5036.420635 | 24055.056927 |
607 rows × 11 columns
data.drop(['generation_gwh_2014','generation_gwh_2015','generation_gwh_2016','generation_gwh_2017','generation_gwh_2018'],axis=1,inplace=True)
data.head()
| capacity_mw | latitude | longitude | primary_fuel | commissioning_year | Total | |
|---|---|---|---|---|---|---|
| 1 | 98.0 | 24.7663 | 74.6090 | Coal | 2015.0 | 24055.056927 |
| 3 | 135.0 | 23.8712 | 91.3602 | Gas | 2004.0 | 3637.554320 |
| 4 | 1800.0 | 21.9603 | 82.4091 | Coal | 2015.0 | 27859.499736 |
| 5 | 250.0 | 23.7689 | 68.6447 | Coal | 2005.0 | 5662.606000 |
| 6 | 60.0 | 10.4547 | 77.0078 | Hydro | 1970.0 | 509.131550 |
sns.heatmap(data.corr(),annot=True)
<AxesSubplot:>
col=['capacity_mw','Total']
for i in col:
sns.boxplot(data[i])
plt.show()
for i in col:
IQR= data[i].quantile(.75)-data[i].quantile(.25)
upper= (data[i].quantile(.75) +(1.5* IQR))
data[i]=np.where(data[i]>upper,upper,data[i])
for i in col:
sns.boxplot(data[i])
plt.show()
data.head()
| capacity_mw | latitude | longitude | primary_fuel | commissioning_year | Total | |
|---|---|---|---|---|---|---|
| 1 | 98.0 | 24.7663 | 74.6090 | Coal | 2015.0 | 24055.056927 |
| 3 | 135.0 | 23.8712 | 91.3602 | Gas | 2004.0 | 3637.554320 |
| 4 | 1453.5 | 21.9603 | 82.4091 | Coal | 2015.0 | 27859.499736 |
| 5 | 250.0 | 23.7689 | 68.6447 | Coal | 2005.0 | 5662.606000 |
| 6 | 60.0 | 10.4547 | 77.0078 | Hydro | 1970.0 | 509.131550 |
inp=data.drop('capacity_mw', axis=1)
out=data['capacity_mw']
inp.shape , out.shape
((607, 5), (607,))
inp.head()
| latitude | longitude | primary_fuel | commissioning_year | Total | |
|---|---|---|---|---|---|
| 1 | 24.7663 | 74.6090 | Coal | 2015.0 | 24055.056927 |
| 3 | 23.8712 | 91.3602 | Gas | 2004.0 | 3637.554320 |
| 4 | 21.9603 | 82.4091 | Coal | 2015.0 | 27859.499736 |
| 5 | 23.7689 | 68.6447 | Coal | 2005.0 | 5662.606000 |
| 6 | 10.4547 | 77.0078 | Hydro | 1970.0 | 509.131550 |
inp.skew()
latitude -0.058579 longitude 0.986666 commissioning_year -1.198052 Total 1.315095 dtype: float64
v=inp.drop('primary_fuel',axis=1)
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
scaled=sc.fit_transform(v)
scaled
array([[ 0.53608541, -0.77608434, 0.91147876, 1.00628113],
[ 0.39288959, 2.50590045, 0.26575529, -0.63446469],
[ 0.08718866, 0.75215301, 0.91147876, 1.31200528],
...,
[-1.32776397, -0.1932456 , -0.37996817, -0.92676426],
[-0.81914763, -0.23772065, 0.97018089, -0.40252899],
[ 0.4694868 , -0.94483485, 0.91147876, 1.00628113]])
from statsmodels.stats.outliers_influence import variance_inflation_factor
VIF= pd.DataFrame()
VIF['features']=v.columns
VIF['vif']= [variance_inflation_factor(scaled,i) for i in range(len(v.columns))]
VIF
| features | vif | |
|---|---|---|
| 0 | latitude | 1.038017 |
| 1 | longitude | 1.030960 |
| 2 | commissioning_year | 1.136174 |
| 3 | Total | 1.130474 |
# 3 Controlled multicolinearity
inp.head(2)
| latitude | longitude | primary_fuel | commissioning_year | Total | |
|---|---|---|---|---|---|
| 1 | 24.7663 | 74.6090 | Coal | 2015.0 | 24055.056927 |
| 3 | 23.8712 | 91.3602 | Gas | 2004.0 | 3637.554320 |
dummied=pd.get_dummies(inp, drop_first=True)
from sklearn.preprocessing import power_transform
transform=power_transform(dummied)
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
standard=sc.fit_transform(transform)
standard.shape , out.shape
((607, 8), (607,))
#It is a regression issue because the output variable's average price has continuous data.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
LR= LinearRegression()
# Finding the best random_state for the model during the train-test split
for i in range(0,200):
x_train,x_test,y_train,y_test= train_test_split(standard,out,random_state=i,test_size=.3)
LR.fit(x_train,y_train)
train_pred=LR.predict(x_train)
test_pred=LR.predict(x_test)
if round(r2_score(y_test,test_pred),2)==round(r2_score(y_train,train_pred),2):
print("At random state ", i, "The model performance very well")
print("At random state: ",i)
print("Test R2 score is: ", round(r2_score(y_test,test_pred),2))
print('Train R2 score is: ', round(r2_score(y_train,train_pred),2))
print('X'*50,'\n')
At random state 8 The model performance very well At random state: 8 Test R2 score is: 0.39 Train R2 score is: 0.39 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX At random state 17 The model performance very well At random state: 17 Test R2 score is: 0.39 Train R2 score is: 0.39 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX At random state 22 The model performance very well At random state: 22 Test R2 score is: 0.39 Train R2 score is: 0.39 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX At random state 25 The model performance very well At random state: 25 Test R2 score is: 0.39 Train R2 score is: 0.39 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX At random state 38 The model performance very well At random state: 38 Test R2 score is: 0.39 Train R2 score is: 0.39 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX At random state 44 The model performance very well At random state: 44 Test R2 score is: 0.39 Train R2 score is: 0.39 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX At random state 86 The model performance very well At random state: 86 Test R2 score is: 0.39 Train R2 score is: 0.39 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX At random state 97 The model performance very well At random state: 97 Test R2 score is: 0.39 Train R2 score is: 0.39 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX At random state 102 The model performance very well At random state: 102 Test R2 score is: 0.39 Train R2 score is: 0.39 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX At random state 155 The model performance very well At random state: 155 Test R2 score is: 0.39 Train R2 score is: 0.39 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX At random state 156 The model performance very well At random state: 156 Test R2 score is: 0.39 Train R2 score is: 0.39 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX At random state 158 The model performance very well At random state: 158 Test R2 score is: 0.39 Train R2 score is: 0.39 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX At random state 161 The model performance very well At random state: 161 Test R2 score is: 0.39 Train R2 score is: 0.39 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX At random state 182 The model performance very well At random state: 182 Test R2 score is: 0.39 Train R2 score is: 0.39 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX At random state 189 The model performance very well At random state: 189 Test R2 score is: 0.39 Train R2 score is: 0.39 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX At random state 195 The model performance very well At random state: 195 Test R2 score is: 0.39 Train R2 score is: 0.39 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
# Random State number 44 .
x_train,x_test,y_train,y_test= train_test_split(standard,out,random_state=44,test_size=.2)
LR.fit(x_train,y_train)
LinearRegression()
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
LR_model= LinearRegression()
RD_model= Ridge()
LS_model= Lasso()
DT_model= DecisionTreeRegressor()
SV_model= SVR()
KNR_model= KNeighborsRegressor()
RFR_model= RandomForestRegressor()
XGB_model= XGBRegressor()
Elastic_model= ElasticNet()
SGH_model= SGDRegressor()
Bag_model=BaggingRegressor()
ADA_model=AdaBoostRegressor()
GB_model= GradientBoostingRegressor()
model=[LR_model,RD_model,LS_model,DT_model,SV_model,KNR_model,RFR_model,XGB_model,Elastic_model,SGH_model,Bag_model,ADA_model,GB_model ]
for m in model:
m.fit(x_train,y_train)
print('mean_absolute_error of ',m ,'model', mean_absolute_error(y_test,m.predict(x_test)))
print('mean_square_error of',m,'model' , mean_squared_error(y_test,m.predict(x_test)))
print('R2 Score of',m,'model', r2_score(y_test,m.predict(x_test) )*100)
print('X' * 50, '\n\n')
mean_absolute_error of LinearRegression() model 264.8930605103229
mean_square_error of LinearRegression() model 128645.41768614174
R2 Score of LinearRegression() model 39.27971970310852
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
mean_absolute_error of Ridge() model 264.9655942674523
mean_square_error of Ridge() model 128666.43799660975
R2 Score of Ridge() model 39.269798174876094
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
mean_absolute_error of Lasso() model 264.9012550849401
mean_square_error of Lasso() model 128798.33570679805
R2 Score of Lasso() model 39.20754282153974
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
mean_absolute_error of DecisionTreeRegressor() model 140.99401639344262
mean_square_error of DecisionTreeRegressor() model 63609.91194075411
R2 Score of DecisionTreeRegressor() model 69.97629801221244
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
mean_absolute_error of SVR() model 324.3736007248969
mean_square_error of SVR() model 237460.53747989514
R2 Score of SVR() model -12.080714996061491
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
mean_absolute_error of KNeighborsRegressor() model 165.2894245901639
mean_square_error of KNeighborsRegressor() model 76996.73324519048
R2 Score of KNeighborsRegressor() model 63.65775549037228
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
mean_absolute_error of RandomForestRegressor() model 121.60888797814209
mean_square_error of RandomForestRegressor() model 41387.81940775061
R2 Score of RandomForestRegressor() model 80.4650640456783
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
mean_absolute_error of XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...) model 128.34053569193063
mean_square_error of XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...) model 44688.59650091947
R2 Score of XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...) model 78.90710641376522
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
mean_absolute_error of ElasticNet() model 286.0568805766707
mean_square_error of ElasticNet() model 137740.9384019941
R2 Score of ElasticNet() model 34.98665915539315
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
mean_absolute_error of SGDRegressor() model 264.6366785206043
mean_square_error of SGDRegressor() model 128751.41544346679
R2 Score of SGDRegressor() model 39.22968905567945
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
mean_absolute_error of BaggingRegressor() model 126.56472158469944
mean_square_error of BaggingRegressor() model 42678.60967488723
R2 Score of BaggingRegressor() model 79.85581462012743
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
mean_absolute_error of AdaBoostRegressor() model 192.86387809515398
mean_square_error of AdaBoostRegressor() model 55895.65946950447
R2 Score of AdaBoostRegressor() model 73.6174037799908
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
mean_absolute_error of GradientBoostingRegressor() model 130.07431736322366
mean_square_error of GradientBoostingRegressor() model 36493.2668319392
R2 Score of GradientBoostingRegressor() model 82.77527928440699
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
from sklearn.model_selection import cross_val_score
for i in model:
print('mean_square of ',i, 'model',mean_squared_error(y_test,i.predict(x_test)))
print("cross Validation score of ",i ," is ",cross_val_score(i,standard,out,cv=10, scoring='neg_mean_squared_error').mean())
print('*'*50)
mean_square of LinearRegression() model 128645.41768614174
cross Validation score of LinearRegression() is -143780.04095285514
**************************************************
mean_square of Ridge() model 128666.43799660975
cross Validation score of Ridge() is -143758.43453657435
**************************************************
mean_square of Lasso() model 128798.33570679805
cross Validation score of Lasso() is -143587.50585740217
**************************************************
mean_square of DecisionTreeRegressor() model 63609.91194075411
cross Validation score of DecisionTreeRegressor() is -79625.96326109182
**************************************************
mean_square of SVR() model 237460.53747989514
cross Validation score of SVR() is -265864.62185436086
**************************************************
mean_square of KNeighborsRegressor() model 76996.73324519048
cross Validation score of KNeighborsRegressor() is -109724.62242995563
**************************************************
mean_square of RandomForestRegressor() model 41387.81940775061
cross Validation score of RandomForestRegressor() is -45077.62639240614
**************************************************
mean_square of XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...) model 44688.59650091947
cross Validation score of XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...) is -51771.37845471575
**************************************************
mean_square of ElasticNet() model 137740.9384019941
cross Validation score of ElasticNet() is -150556.21358762536
**************************************************
mean_square of SGDRegressor() model 128751.41544346679
cross Validation score of SGDRegressor() is -143784.40641704988
**************************************************
mean_square of BaggingRegressor() model 42678.60967488723
cross Validation score of BaggingRegressor() is -49763.20175897897
**************************************************
mean_square of AdaBoostRegressor() model 55895.65946950447
cross Validation score of AdaBoostRegressor() is -66574.19803662592
**************************************************
mean_square of GradientBoostingRegressor() model 36493.2668319392
cross Validation score of GradientBoostingRegressor() is -49250.841263385446
**************************************************
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
n_estimator= [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000] # the default number of trees in a random forest is 100
max_features= ['auto','sqrt','log2'] # fewest attributes required to design a decision tree
max_depth=[10, 64, 118, 173, 227, 282, 336, 391, 445, 500] # Maximum decision tree depth
min_samples_split= [1,2,3] # Minimal number of samples needed to split the node
min_samples_leaf= [1,3,4,6,7,9] #each leaf node requires a minimum number of samples.
random_grid= {'n_estimators': n_estimator,
'max_features':max_features,
'max_depth':max_depth,
'min_samples_split':min_samples_split,
'min_samples_leaf':min_samples_leaf,
}
random_grid
{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
'max_features': ['auto', 'sqrt', 'log2'],
'max_depth': [10, 64, 118, 173, 227, 282, 336, 391, 445, 500],
'min_samples_split': [1, 2, 3],
'min_samples_leaf': [1, 3, 4, 6, 7, 9]}
randomCV =RandomizedSearchCV(RFR_model,param_distributions=random_grid,cv=5,random_state=100,verbose=2,n_jobs=-1)
randomCV.fit(x_train,y_train)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
param_distributions={'max_depth': [10, 64, 118, 173, 227,
282, 336, 391, 445, 500],
'max_features': ['auto', 'sqrt',
'log2'],
'min_samples_leaf': [1, 3, 4, 6, 7, 9],
'min_samples_split': [1, 2, 3],
'n_estimators': [200, 400, 600, 800,
1000, 1200, 1400, 1600,
1800, 2000]},
random_state=100, verbose=2)
randomCV.best_estimator_
RandomForestRegressor(max_depth=445, min_samples_leaf=3, min_samples_split=3,
n_estimators=1400)
randomCV.best_params_
{'n_estimators': 1400,
'min_samples_split': 3,
'min_samples_leaf': 3,
'max_features': 'auto',
'max_depth': 445}
random_predict=randomCV.best_estimator_.predict(x_test)
print('mean_square_error of GrieSearchCV model' , mean_squared_error(y_test,random_predict))
print('R2 Score of Grid Search CV model', r2_score(y_test,random_predict )*100)
mean_square_error of GrieSearchCV model 37606.34510836825 R2 Score of Grid Search CV model 82.2499094255128
# Square Root Error is :
np.sqrt(mean_squared_error(y_test,random_predict))
193.9235548054136
import joblib
joblib.dump(randomCV.best_estimator_,'Global_Power_capacity_prediction.pkl')
['Global_Power_capacity_prediction.pkl']